author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /fs/xfs/scrub
parent | Initial commit. (diff)
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz, linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15. (tag: upstream/6.6.15)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/xfs/scrub')
44 files changed, 19006 insertions, 0 deletions
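For orientation only (this is not part of the commit itself): the 44 files added below implement the kernel side of XFS online scrub, which userspace drives through the XFS_IOC_SCRUB_METADATA ioctl. A minimal sketch of invoking the AGF scrubber added in agheader.c follows; the <xfs/xfs.h> include path (from xfsprogs' installed headers) and the /mnt mount point are assumptions, error handling is elided, and real tools such as xfs_scrub handle far more cases.

```c
/* Sketch: ask the kernel to scrub AG 0's AGF on a mounted XFS filesystem. */
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* pulls in xfs_fs.h: struct xfs_scrub_metadata, XFS_IOC_SCRUB_METADATA */

int main(void)
{
	struct xfs_scrub_metadata sm = {
		.sm_type = XFS_SCRUB_TYPE_AGF,	/* check the AG free space header */
		.sm_agno = 0,			/* allocation group 0 */
	};
	int fd = open("/mnt", O_RDONLY);	/* any file or dir on the XFS mount */

	if (fd < 0 || ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0) {
		perror("scrub");
		return 1;
	}
	/* The kernel reports results via output flags in sm_flags. */
	if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		printf("AGF 0 is corrupt\n");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_PREEN)
		printf("AGF 0 could be optimized\n");
	else
		printf("AGF 0 is clean\n");
	return 0;
}
```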
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c new file mode 100644 index 0000000000..6c6e5eba42 --- /dev/null +++ b/fs/xfs/scrub/agheader.c @@ -0,0 +1,955 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +int +xchk_setup_agheader( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_fs(sc); +} + +/* Superblock */ + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_superblock_xref( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sm->sm_agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_SB_BLOCK(mp); + + error = xchk_ag_init_existing(sc, agno, &sc->sa); + if (!xchk_xref_process_error(sc, agno, agbno, &error)) + return; + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_xref_is_not_inode_chunk(sc, agbno, 1); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); + + /* scrub teardown will take care of sc->sa for us */ +} + +/* + * Scrub the filesystem superblock. + * + * Note: We do /not/ attempt to check AG 0's superblock. Mount is + * responsible for validating all the geometry information in sb 0, so + * if the filesystem is capable of initiating online scrub, then clearly + * sb 0 is ok and we can use its information to check everything else. + */ +int +xchk_superblock( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_dsb *sb; + struct xfs_perag *pag; + xfs_agnumber_t agno; + uint32_t v2_ok; + __be32 features_mask; + int error; + __be16 vernum_mask; + + agno = sc->sm->sm_agno; + if (agno == 0) + return 0; + + /* + * Grab an active reference to the perag structure. If we can't get + * it, we're racing with something that's tearing down the AG, so + * signal that the AG no longer exists. + */ + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ENOENT; + + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); + /* + * The superblock verifier can return several different error codes + * if it thinks the superblock doesn't look right. For a mount these + * would all get bounced back to userspace, but if we're here then the + * fs mounted successfully, which means that this secondary superblock + * is simply incorrect. Treat all these codes the same way we treat + * any corruption. + */ + switch (error) { + case -EINVAL: /* also -EWRONGFS */ + case -ENOSYS: + case -EFBIG: + error = -EFSCORRUPTED; + fallthrough; + default: + break; + } + if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) + goto out_pag; + + sb = bp->b_addr; + + /* + * Verify the geometries match. Fields that are permanently + * set by mkfs are checked; fields that can be updated later + * (and are not propagated to backup superblocks) are preen + * checked. 
+ */ + if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents)) + xchk_block_set_corrupt(sc, bp); + + if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks)) + xchk_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that are set at mkfs time. */ + vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | + XFS_SB_VERSION_NUMBITS | + XFS_SB_VERSION_ALIGNBIT | + XFS_SB_VERSION_DALIGNBIT | + XFS_SB_VERSION_SHAREDBIT | + XFS_SB_VERSION_LOGV2BIT | + XFS_SB_VERSION_SECTORBIT | + XFS_SB_VERSION_EXTFLGBIT | + XFS_SB_VERSION_DIRV2BIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xchk_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that can be set after mkfs time. */ + vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT | + XFS_SB_VERSION_QUOTABIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock)) + xchk_block_set_corrupt(sc, bp); + + if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname))) + xchk_block_set_preen(sc, bp); + + if (sb->sb_blocklog != mp->m_sb.sb_blocklog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_sectlog != mp->m_sb.sb_sectlog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_inodelog != mp->m_sb.sb_inodelog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_inopblog != mp->m_sb.sb_inopblog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_agblklog != mp->m_sb.sb_agblklog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rextslog != mp->m_sb.sb_rextslog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct) + xchk_block_set_preen(sc, bp); + + /* + * Skip the summary counters since we track them in memory anyway. + * sb_icount, sb_ifree, sb_fdblocks, sb_frexents + */ + + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xchk_block_set_preen(sc, bp); + + /* + * Skip the quota flags since repair will force quotacheck. 
+ * sb_qflags + */ + + if (sb->sb_flags != mp->m_sb.sb_flags) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit)) + xchk_block_set_corrupt(sc, bp); + + /* Do we see any invalid bits in sb_features2? */ + if (!xfs_sb_version_hasmorebits(&mp->m_sb)) { + if (sb->sb_features2 != 0) + xchk_block_set_corrupt(sc, bp); + } else { + v2_ok = XFS_SB_VERSION2_OKBITS; + if (xfs_sb_is_v5(&mp->m_sb)) + v2_ok |= XFS_SB_VERSION2_CRCBIT; + + if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_features2 != sb->sb_bad_features2) + xchk_block_set_preen(sc, bp); + } + + /* Check sb_features2 flags that are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT | + XFS_SB_VERSION2_PROJID32BIT | + XFS_SB_VERSION2_CRCBIT | + XFS_SB_VERSION2_FTYPE); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xchk_block_set_corrupt(sc, bp); + + /* Check sb_features2 flags that can be set after mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xchk_block_set_preen(sc, bp); + + if (!xfs_has_crc(mp)) { + /* all v5 fields must be zero */ + if (memchr_inv(&sb->sb_features_compat, 0, + sizeof(struct xfs_dsb) - + offsetof(struct xfs_dsb, sb_features_compat))) + xchk_block_set_corrupt(sc, bp); + } else { + /* compat features must match */ + if (sb->sb_features_compat != + cpu_to_be32(mp->m_sb.sb_features_compat)) + xchk_block_set_corrupt(sc, bp); + + /* ro compat features must match */ + if (sb->sb_features_ro_compat != + cpu_to_be32(mp->m_sb.sb_features_ro_compat)) + xchk_block_set_corrupt(sc, bp); + + /* + * NEEDSREPAIR is ignored on a secondary super, so we should + * clear it when we find it, though it's not a corruption. + */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & features_mask) + xchk_block_set_preen(sc, bp); + + /* all other incompat features must match */ + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & ~features_mask) + xchk_block_set_corrupt(sc, bp); + + /* + * log incompat features protect newer log record types from + * older log recovery code. Log recovery doesn't check the + * secondary supers, so we can clear these if needed. 
+ */ + if (sb->sb_features_log_incompat) + xchk_block_set_preen(sc, bp); + + /* Don't care about sb_crc */ + + if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xchk_block_set_preen(sc, bp); + + /* Don't care about sb_lsn */ + } + + if (xfs_has_metauuid(mp)) { + /* The metadata UUID must be the same for all supers */ + if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + xchk_block_set_corrupt(sc, bp); + } + + /* Everything else must be zero. */ + if (memchr_inv(sb + 1, 0, + BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + xchk_block_set_corrupt(sc, bp); + + xchk_superblock_xref(sc, bp); +out_pag: + xfs_perag_put(pag); + return error; +} + +/* AGF */ + +/* Tally freespace record lengths. */ +STATIC int +xchk_agf_record_bno_lengths( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv) +{ + xfs_extlen_t *blocks = priv; + + (*blocks) += rec->ar_blockcount; + return 0; +} + +/* Check agf_freeblks */ +static inline void +xchk_agf_xref_freeblks( + struct xfs_scrub *sc) +{ + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_extlen_t blocks = 0; + int error; + + if (!sc->sa.bno_cur) + return; + + error = xfs_alloc_query_all(sc->sa.bno_cur, + xchk_agf_record_bno_lengths, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_freeblks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross reference the AGF with the cntbt (freespace by length btree) */ +static inline void +xchk_agf_xref_cntbt( + struct xfs_scrub *sc) +{ + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_agblock_t agbno; + xfs_extlen_t blocks; + int have; + int error; + + if (!sc->sa.cnt_cur) + return; + + /* Any freespace at all? */ + error = xfs_alloc_lookup_le(sc->sa.cnt_cur, 0, -1U, &have); + if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have) { + if (agf->agf_freeblks != cpu_to_be32(0)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); + return; + } + + /* Check agf_longest */ + error = xfs_alloc_get_rec(sc->sa.cnt_cur, &agbno, &blocks, &have); + if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have || blocks != be32_to_cpu(agf->agf_longest)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check the btree block counts in the AGF against the btrees. */ +STATIC void +xchk_agf_xref_btreeblks( + struct xfs_scrub *sc) +{ + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t blocks; + xfs_agblock_t btreeblks; + int error; + + /* agf_btreeblks didn't exist before lazysbcount */ + if (!xfs_has_lazysbcount(sc->mp)) + return; + + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ + if (sc->sa.rmap_cur) { + error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + btreeblks = blocks - 1; + if (blocks != be32_to_cpu(agf->agf_rmap_blocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); + } else { + btreeblks = 0; + } + + /* + * No rmap cursor; we can't xref if we have the rmapbt feature. + * We also can't do it if we're missing the free space btree cursors. 
+ */ + if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) || + !sc->sa.bno_cur || !sc->sa.cnt_cur) + return; + + /* Check agf_btreeblks */ + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + btreeblks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + btreeblks += blocks - 1; + + if (btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check agf_refcount_blocks against tree size */ +static inline void +xchk_agf_xref_refcblks( + struct xfs_scrub *sc) +{ + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_agblock_t blocks; + int error; + + if (!sc->sa.refc_cur) + return; + + error = xfs_btree_count_blocks(sc->sa.refc_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_refcount_blocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_agf_xref( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGF_BLOCK(mp); + + xchk_ag_btcur_init(sc, &sc->sa); + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_agf_xref_freeblks(sc); + xchk_agf_xref_cntbt(sc); + xchk_xref_is_not_inode_chunk(sc, agbno, 1); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_agf_xref_btreeblks(sc); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); + xchk_agf_xref_refcblks(sc); + + /* scrub teardown will take care of sc->sa for us */ +} + +/* Scrub the AGF. */ +int +xchk_agf( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf; + struct xfs_perag *pag; + xfs_agnumber_t agno = sc->sm->sm_agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agblock_t agfl_first; + xfs_agblock_t agfl_last; + xfs_agblock_t agfl_count; + xfs_agblock_t fl_count; + int level; + int error = 0; + + error = xchk_ag_read_headers(sc, agno, &sc->sa); + if (!xchk_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) + goto out; + xchk_buffer_recheck(sc, sc->sa.agf_bp); + + agf = sc->sa.agf_bp->b_addr; + pag = sc->sa.pag; + + /* Check the AG length */ + eoag = be32_to_cpu(agf->agf_length); + if (eoag != pag->block_count) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + /* Check the AGF btree roots and levels */ + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + if (!xfs_verify_agbno(pag, agbno)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + if (!xfs_verify_agbno(pag, agbno)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + if (level <= 0 || level > mp->m_alloc_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + if (level <= 0 || level > mp->m_alloc_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + if (xfs_has_rmapbt(mp)) { + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + if (!xfs_verify_agbno(pag, agbno)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + if (level <= 0 || level > mp->m_rmap_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + } + + if (xfs_has_reflink(mp)) { + agbno = be32_to_cpu(agf->agf_refcount_root); + if (!xfs_verify_agbno(pag, 
agbno)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_refcount_level); + if (level <= 0 || level > mp->m_refc_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + } + + /* Check the AGFL counters */ + agfl_first = be32_to_cpu(agf->agf_flfirst); + agfl_last = be32_to_cpu(agf->agf_fllast); + agfl_count = be32_to_cpu(agf->agf_flcount); + if (agfl_last > agfl_first) + fl_count = agfl_last - agfl_first + 1; + else + fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1; + if (agfl_count != 0 && fl_count != agfl_count) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + /* Do the incore counters match? */ + if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (xfs_has_lazysbcount(sc->mp) && + pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + + xchk_agf_xref(sc); +out: + return error; +} + +/* AGFL */ + +struct xchk_agfl_info { + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ + unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. */ + xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; + struct xfs_scrub *sc; +}; + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_agfl_block_xref( + struct xfs_scrub *sc, + xfs_agblock_t agbno) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_xref_is_not_inode_chunk(sc, agbno, 1); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); +} + +/* Scrub an AGFL block. */ +STATIC int +xchk_agfl_block( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xchk_agfl_info *sai = priv; + struct xfs_scrub *sc = sai->sc; + + if (xfs_verify_agbno(sc->sa.pag, agbno) && + sai->nr_entries < sai->agflcount) + sai->entries[sai->nr_entries++] = agbno; + else + xchk_block_set_corrupt(sc, sai->agfl_bp); + + xchk_agfl_block_xref(sc, agbno); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + + return 0; +} + +static int +xchk_agblock_cmp( + const void *pa, + const void *pb) +{ + const xfs_agblock_t *a = pa; + const xfs_agblock_t *b = pb; + + return (int)*a - (int)*b; +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_agfl_xref( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGFL_BLOCK(mp); + + xchk_ag_btcur_init(sc, &sc->sa); + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_xref_is_not_inode_chunk(sc, agbno, 1); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); + + /* + * Scrub teardown will take care of sc->sa for us. Leave sc->sa + * active so that the agfl block xref can use it too. + */ +} + +/* Scrub the AGFL. */ +int +xchk_agfl( + struct xfs_scrub *sc) +{ + struct xchk_agfl_info sai = { + .sc = sc, + }; + struct xfs_agf *agf; + xfs_agnumber_t agno = sc->sm->sm_agno; + unsigned int i; + int error; + + /* Lock the AGF and AGI so that nobody can touch this AG. 
*/ + error = xchk_ag_read_headers(sc, agno, &sc->sa); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + if (!sc->sa.agf_bp) + return -EFSCORRUPTED; + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); + + xchk_agfl_xref(sc); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Allocate buffer to ensure uniqueness of AGFL entries. */ + agf = sc->sa.agf_bp->b_addr; + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + goto out; + } + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + XCHK_GFP_FLAGS); + if (!sai.entries) { + error = -ENOMEM; + goto out; + } + + /* Check the blocks in the AGFL. */ + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); + if (error == -ECANCELED) { + error = 0; + goto out_free; + } + if (error) + goto out_free; + + if (sai.agflcount != sai.nr_entries) { + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + goto out_free; + } + + /* Sort entries, check for duplicates. */ + sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]), + xchk_agblock_cmp, NULL); + for (i = 1; i < sai.nr_entries; i++) { + if (sai.entries[i] == sai.entries[i - 1]) { + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + break; + } + } + +out_free: + kvfree(sai.entries); +out: + return error; +} + +/* AGI */ + +/* Check agi_count/agi_freecount */ +static inline void +xchk_agi_xref_icounts( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agino_t icount; + xfs_agino_t freecount; + int error; + + if (!sc->sa.ino_cur) + return; + + error = xfs_ialloc_count_inodes(sc->sa.ino_cur, &icount, &freecount); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (be32_to_cpu(agi->agi_count) != icount || + be32_to_cpu(agi->agi_freecount) != freecount) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); +} + +/* Check agi_[fi]blocks against tree size */ +static inline void +xchk_agi_xref_fiblocks( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agblock_t blocks; + int error = 0; + + if (!xfs_has_inobtcounts(sc->mp)) + return; + + if (sc->sa.ino_cur) { + error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_iblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_fblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } +} + +/* Cross-reference with the other btrees. 
*/ +STATIC void +xchk_agi_xref( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGI_BLOCK(mp); + + xchk_ag_btcur_init(sc, &sc->sa); + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_xref_is_not_inode_chunk(sc, agbno, 1); + xchk_agi_xref_icounts(sc); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); + xchk_agi_xref_fiblocks(sc); + + /* scrub teardown will take care of sc->sa for us */ +} + +/* Scrub the AGI. */ +int +xchk_agi( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agi *agi; + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(sc->mp); + xfs_agnumber_t agno = sc->sm->sm_agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agino_t agino; + xfs_agino_t first_agino; + xfs_agino_t last_agino; + xfs_agino_t icount; + int i; + int level; + int error = 0; + + error = xchk_ag_read_headers(sc, agno, &sc->sa); + if (!xchk_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) + goto out; + xchk_buffer_recheck(sc, sc->sa.agi_bp); + + agi = sc->sa.agi_bp->b_addr; + pag = sc->sa.pag; + + /* Check the AG length */ + eoag = be32_to_cpu(agi->agi_length); + if (eoag != pag->block_count) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check btree roots and levels */ + agbno = be32_to_cpu(agi->agi_root); + if (!xfs_verify_agbno(pag, agbno)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_level); + if (level <= 0 || level > igeo->inobt_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + if (xfs_has_finobt(mp)) { + agbno = be32_to_cpu(agi->agi_free_root); + if (!xfs_verify_agbno(pag, agbno)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_free_level); + if (level <= 0 || level > igeo->inobt_maxlevels) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + } + + /* Check inode counters */ + xfs_agino_range(mp, agno, &first_agino, &last_agino); + icount = be32_to_cpu(agi->agi_count); + if (icount > last_agino - first_agino + 1 || + icount < be32_to_cpu(agi->agi_freecount)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check inode pointers */ + agino = be32_to_cpu(agi->agi_newino); + if (!xfs_verify_agino_or_null(pag, agino)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + agino = be32_to_cpu(agi->agi_dirino); + if (!xfs_verify_agino_or_null(pag, agino)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check unlinked inode buckets */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + agino = be32_to_cpu(agi->agi_unlinked[i]); + if (!xfs_verify_agino_or_null(pag, agino)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + } + + if (agi->agi_pad32 != cpu_to_be32(0)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Do the incore counters match? */ + if (pag->pagi_count != be32_to_cpu(agi->agi_count)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + + xchk_agi_xref(sc); +out: + return error; +} diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c new file mode 100644 index 0000000000..876a2f41b0 --- /dev/null +++ b/fs/xfs/scrub/agheader_repair.c @@ -0,0 +1,1035 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/reap.h" + +/* Superblock */ + +/* Repair the superblock. */ +int +xrep_superblock( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_agnumber_t agno; + int error; + + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */ + agno = sc->sm->sm_agno; + if (agno == 0) + return -EOPNOTSUPP; + + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy AG 0's superblock to this one. */ + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + + /* + * Don't write out a secondary super with NEEDSREPAIR or log incompat + * features set, since both are ignored when set on a secondary. + */ + if (xfs_has_crc(mp)) { + struct xfs_dsb *sb = bp->b_addr; + + sb->sb_features_incompat &= + ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sb->sb_features_log_incompat = 0; + } + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return error; +} + +/* AGF */ + +struct xrep_agf_allocbt { + struct xfs_scrub *sc; + xfs_agblock_t freeblks; + xfs_agblock_t longest; +}; + +/* Record free space shape information. */ +STATIC int +xrep_agf_walk_allocbt( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xrep_agf_allocbt *raa = priv; + int error = 0; + + if (xchk_should_terminate(raa->sc, &error)) + return error; + + raa->freeblks += rec->ar_blockcount; + if (rec->ar_blockcount > raa->longest) + raa->longest = rec->ar_blockcount; + return error; +} + +/* Does this AGFL block look sane? */ +STATIC int +xrep_agf_check_agfl_block( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xfs_scrub *sc = priv; + + if (!xfs_verify_agbno(sc->sa.pag, agbno)) + return -EFSCORRUPTED; + return 0; +} + +/* + * Offset within the xrep_find_ag_btree array for each btree type. Avoid the + * XFS_BTNUM_ names here to avoid creating a sparse array. + */ +enum { + XREP_AGF_BNOBT = 0, + XREP_AGF_CNTBT, + XREP_AGF_RMAPBT, + XREP_AGF_REFCOUNTBT, + XREP_AGF_END, + XREP_AGF_MAX +}; + +/* Check a btree root candidate. */ +static inline bool +xrep_check_btree_root( + struct xfs_scrub *sc, + struct xrep_find_ag_btree *fab) +{ + return xfs_verify_agbno(sc->sa.pag, fab->root) && + fab->height <= fab->maxlevels; +} + +/* + * Given the btree roots described by *fab, find the roots, check them for + * sanity, and pass the root data back out via *fab. + * + * This is /also/ a chicken and egg problem because we have to use the rmapbt + * (rooted in the AGF) to find the btrees rooted in the AGF. We also have no + * idea if the btrees make any sense. If we hit obvious corruptions in those + * btrees we'll bail out. 
+ */ +STATIC int +xrep_agf_find_btrees( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp, + struct xrep_find_ag_btree *fab, + struct xfs_buf *agfl_bp) +{ + struct xfs_agf *old_agf = agf_bp->b_addr; + int error; + + /* Go find the root data. */ + error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp); + if (error) + return error; + + /* We must find the bnobt, cntbt, and rmapbt roots. */ + if (!xrep_check_btree_root(sc, &fab[XREP_AGF_BNOBT]) || + !xrep_check_btree_root(sc, &fab[XREP_AGF_CNTBT]) || + !xrep_check_btree_root(sc, &fab[XREP_AGF_RMAPBT])) + return -EFSCORRUPTED; + + /* + * We relied on the rmapbt to reconstruct the AGF. If we get a + * different root then something's seriously wrong. + */ + if (fab[XREP_AGF_RMAPBT].root != + be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi])) + return -EFSCORRUPTED; + + /* We must find the refcountbt root if that feature is enabled. */ + if (xfs_has_reflink(sc->mp) && + !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT])) + return -EFSCORRUPTED; + + return 0; +} + +/* + * Reinitialize the AGF header, making an in-core copy of the old contents so + * that we know which in-core state needs to be reinitialized. + */ +STATIC void +xrep_agf_init_header( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp, + struct xfs_agf *old_agf) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = agf_bp->b_addr; + + memcpy(old_agf, agf, sizeof(*old_agf)); + memset(agf, 0, BBTOB(agf_bp->b_length)); + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(pag->pag_agno); + agf->agf_length = cpu_to_be32(pag->block_count); + agf->agf_flfirst = old_agf->agf_flfirst; + agf->agf_fllast = old_agf->agf_fllast; + agf->agf_flcount = old_agf->agf_flcount; + if (xfs_has_crc(mp)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + + /* Mark the incore AGF data stale until we're done fixing things. */ + ASSERT(xfs_perag_initialised_agf(pag)); + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); +} + +/* Set btree root information in an AGF. */ +STATIC void +xrep_agf_set_roots( + struct xfs_scrub *sc, + struct xfs_agf *agf, + struct xrep_find_ag_btree *fab) +{ + agf->agf_roots[XFS_BTNUM_BNOi] = + cpu_to_be32(fab[XREP_AGF_BNOBT].root); + agf->agf_levels[XFS_BTNUM_BNOi] = + cpu_to_be32(fab[XREP_AGF_BNOBT].height); + + agf->agf_roots[XFS_BTNUM_CNTi] = + cpu_to_be32(fab[XREP_AGF_CNTBT].root); + agf->agf_levels[XFS_BTNUM_CNTi] = + cpu_to_be32(fab[XREP_AGF_CNTBT].height); + + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(fab[XREP_AGF_RMAPBT].root); + agf->agf_levels[XFS_BTNUM_RMAPi] = + cpu_to_be32(fab[XREP_AGF_RMAPBT].height); + + if (xfs_has_reflink(sc->mp)) { + agf->agf_refcount_root = + cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root); + agf->agf_refcount_level = + cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height); + } +} + +/* Update all AGF fields which derive from btree contents. */ +STATIC int +xrep_agf_calc_from_btrees( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp) +{ + struct xrep_agf_allocbt raa = { .sc = sc }; + struct xfs_btree_cur *cur = NULL; + struct xfs_agf *agf = agf_bp->b_addr; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t btreeblks; + xfs_agblock_t blocks; + int error; + + /* Update the AGF counters from the bnobt. 
*/ + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); + if (error) + goto err; + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + btreeblks = blocks - 1; + agf->agf_freeblks = cpu_to_be32(raa.freeblks); + agf->agf_longest = cpu_to_be32(raa.longest); + + /* Update the AGF counters from the cntbt. */ + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + btreeblks += blocks - 1; + + /* Update the AGF counters from the rmapbt. */ + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agf->agf_rmap_blocks = cpu_to_be32(blocks); + btreeblks += blocks - 1; + + agf->agf_btreeblks = cpu_to_be32(btreeblks); + + /* Update the AGF counters from the refcountbt. */ + if (xfs_has_reflink(mp)) { + cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag); + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agf->agf_refcount_blocks = cpu_to_be32(blocks); + } + + return 0; +err: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* Commit the new AGF and reinitialize the incore state. */ +STATIC int +xrep_agf_commit_new( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp) +{ + struct xfs_perag *pag; + struct xfs_agf *agf = agf_bp->b_addr; + + /* Trigger fdblocks recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF); + xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1); + + /* Now reinitialize the in-core counters we changed. */ + pag = sc->sa.pag; + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); + pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_longest = be32_to_cpu(agf->agf_longest); + pag->pagf_levels[XFS_BTNUM_BNOi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); + pag->pagf_levels[XFS_BTNUM_CNTi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + pag->pagf_levels[XFS_BTNUM_RMAPi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + + return 0; +} + +/* Repair the AGF. v5 filesystems only. */ +int +xrep_agf( + struct xfs_scrub *sc) +{ + struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { + [XREP_AGF_BNOBT] = { + .rmap_owner = XFS_RMAP_OWN_AG, + .buf_ops = &xfs_bnobt_buf_ops, + .maxlevels = sc->mp->m_alloc_maxlevels, + }, + [XREP_AGF_CNTBT] = { + .rmap_owner = XFS_RMAP_OWN_AG, + .buf_ops = &xfs_cntbt_buf_ops, + .maxlevels = sc->mp->m_alloc_maxlevels, + }, + [XREP_AGF_RMAPBT] = { + .rmap_owner = XFS_RMAP_OWN_AG, + .buf_ops = &xfs_rmapbt_buf_ops, + .maxlevels = sc->mp->m_rmap_maxlevels, + }, + [XREP_AGF_REFCOUNTBT] = { + .rmap_owner = XFS_RMAP_OWN_REFC, + .buf_ops = &xfs_refcountbt_buf_ops, + .maxlevels = sc->mp->m_refc_maxlevels, + }, + [XREP_AGF_END] = { + .buf_ops = NULL, + }, + }; + struct xfs_agf old_agf; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agf_bp; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + int error; + + /* We require the rmapbt to rebuild anything. 
*/ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + /* + * Make sure we have the AGF buffer, as scrub might have decided it + * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. + */ + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); + if (error) + return error; + agf_bp->b_ops = &xfs_agf_buf_ops; + agf = agf_bp->b_addr; + + /* + * Load the AGFL so that we can screen out OWN_AG blocks that are on + * the AGFL now; these blocks might have once been part of the + * bno/cnt/rmap btrees but are not now. This is a chicken and egg + * problem: the AGF is corrupt, so we have to trust the AGFL contents + * because we can't do any serious cross-referencing with any of the + * btrees rooted in the AGF. If the AGFL contents are obviously bad + * then we'll bail out. + */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + /* + * Spot-check the AGFL blocks; if they're obviously corrupt then + * there's nothing we can do but bail out. + */ + error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp, + xrep_agf_check_agfl_block, sc); + if (error) + return error; + + /* + * Find the AGF btree roots. This is also a chicken-and-egg situation; + * see the function for more details. + */ + error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Start rewriting the header and implant the btrees we found. */ + xrep_agf_init_header(sc, agf_bp, &old_agf); + xrep_agf_set_roots(sc, agf, fab); + error = xrep_agf_calc_from_btrees(sc, agf_bp); + if (error) + goto out_revert; + + /* Commit the changes and reinitialize incore state. */ + return xrep_agf_commit_new(sc, agf_bp); + +out_revert: + /* Mark the incore AGF state stale and revert the AGF. */ + clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate); + memcpy(agf, &old_agf, sizeof(old_agf)); + return error; +} + +/* AGFL */ + +struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xagb_bitmap crossed; + + /* Bitmap of other OWN_AG metadata blocks. */ + struct xagb_bitmap agmetablocks; + + /* Bitmap of free space. */ + struct xagb_bitmap *freesp; + + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + + struct xfs_scrub *sc; +}; + +/* Record all OWN_AG (free space btree) information from the rmap data. */ +STATIC int +xrep_agfl_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_agfl *ra = priv; + int error = 0; + + if (xchk_should_terminate(ra->sc, &error)) + return error; + + /* Record all the OWN_AG blocks. */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(ra->freesp, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + } + + return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur); +} + +/* Strike out the blocks that are cross-linked according to the rmapbt. 
*/ +STATIC int +xrep_agfl_check_extent( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xrep_agfl *ra = priv; + xfs_agblock_t agbno = start; + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xagb_bitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + +/* + * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce + * which blocks belong to the AGFL. + * + * Compute the set of old AGFL blocks by subtracting from the list of OWN_AG + * blocks the list of blocks owned by all other OWN_AG metadata (bnobt, cntbt, + * rmapbt). These are the old AGFL blocks, so return that list and the number + * of blocks we're actually going to put back on the AGFL. + */ +STATIC int +xrep_agfl_collect_blocks( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp, + struct xagb_bitmap *agfl_extents, + xfs_agblock_t *flcount) +{ + struct xrep_agfl ra; + struct xfs_mount *mp = sc->mp; + struct xfs_btree_cur *cur; + int error; + + ra.sc = sc; + ra.freesp = agfl_extents; + xagb_bitmap_init(&ra.agmetablocks); + xagb_bitmap_init(&ra.crossed); + + /* Find all space used by the free space btrees & rmapbt. */ + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); + xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; + + /* Find all blocks currently being used by the bnobt. */ + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); + xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; + + /* Find all blocks currently being used by the cntbt. */ + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); + xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; + + /* + * Drop the freesp meta blocks that are in use by btrees. + * The remaining blocks /should/ be AGFL blocks. + */ + error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks); + if (error) + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xagb_bitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; + + /* + * Calculate the new AGFL size. If we found more blocks than fit in + * the AGFL we'll free them later. + */ + *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents), + xfs_agfl_size(mp)); + +out_bmp: + xagb_bitmap_destroy(&ra.crossed); + xagb_bitmap_destroy(&ra.agmetablocks); + return error; +} + +/* Update the AGF and reset the in-core state. */ +STATIC void +xrep_agfl_update_agf( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp, + xfs_agblock_t flcount) +{ + struct xfs_agf *agf = agf_bp->b_addr; + + ASSERT(flcount <= xfs_agfl_size(sc->mp)); + + /* Trigger fdblocks recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* Update the AGF counters. 
*/ + if (xfs_perag_initialised_agf(sc->sa.pag)) { + sc->sa.pag->pagf_flcount = flcount; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, + &sc->sa.pag->pag_opstate); + } + agf->agf_flfirst = cpu_to_be32(0); + agf->agf_flcount = cpu_to_be32(flcount); + if (flcount) + agf->agf_fllast = cpu_to_be32(flcount - 1); + else + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1); + + xfs_alloc_log_agf(sc->tp, agf_bp, + XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); +} + +struct xrep_agfl_fill { + struct xagb_bitmap used_extents; + struct xfs_scrub *sc; + __be32 *agfl_bno; + xfs_agblock_t flcount; + unsigned int fl_off; +}; + +/* Fill the AGFL with whatever blocks are in this extent. */ +static int +xrep_agfl_fill( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xrep_agfl_fill *af = priv; + struct xfs_scrub *sc = af->sc; + xfs_agblock_t agbno = start; + int error; + + trace_xrep_agfl_insert(sc->sa.pag, agbno, len); + + while (agbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); + + error = xagb_bitmap_set(&af->used_extents, start, agbno - 1); + if (error) + return error; + + if (af->fl_off == af->flcount) + return -ECANCELED; + + return 0; +} + +/* Write out a totally new AGFL. */ +STATIC int +xrep_agfl_init_header( + struct xfs_scrub *sc, + struct xfs_buf *agfl_bp, + struct xagb_bitmap *agfl_extents, + xfs_agblock_t flcount) +{ + struct xrep_agfl_fill af = { + .sc = sc, + .flcount = flcount, + }; + struct xfs_mount *mp = sc->mp; + struct xfs_agfl *agfl; + int error; + + ASSERT(flcount <= xfs_agfl_size(mp)); + + /* + * Start rewriting the header by setting the bno[] array to + * NULLAGBLOCK, then setting AGFL header fields. + */ + agfl = XFS_BUF_TO_AGFL(agfl_bp); + memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); + + /* + * Fill the AGFL with the remaining blocks. If agfl_extents has more + * blocks than fit in the AGFL, they will be freed in a subsequent + * step. + */ + xagb_bitmap_init(&af.used_extents); + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); + if (error) + return error; + + /* Write new AGFL to disk. */ + xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); + xagb_bitmap_destroy(&af.used_extents); + return 0; +} + +/* Repair the AGFL. */ +int +xrep_agfl( + struct xfs_scrub *sc) +{ + struct xagb_bitmap agfl_extents; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agf_bp; + struct xfs_buf *agfl_bp; + xfs_agblock_t flcount; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + xagb_bitmap_init(&agfl_extents); + + /* + * Read the AGF so that we can query the rmapbt. We hope that there's + * nothing wrong with the AGF, but all the AG header repair functions + * have this chicken-and-egg problem. + */ + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); + if (error) + return error; + + /* + * Make sure we have the AGFL buffer, as scrub might have decided it + * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. 
+ */ + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); + if (error) + return error; + agfl_bp->b_ops = &xfs_agfl_buf_ops; + + /* Gather all the extents we're going to put on the new AGFL. */ + error = xrep_agfl_collect_blocks(sc, agf_bp, &agfl_extents, &flcount); + if (error) + goto err; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err; + + /* + * Update AGF and AGFL. We reset the global free block counter when + * we adjust the AGF flcount (which can fail) so avoid updating any + * buffers until we know that part works. + */ + xrep_agfl_update_agf(sc, agf_bp, flcount); + error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + if (error) + goto err; + + /* + * Ok, the AGFL should be ready to go now. Roll the transaction to + * make the new AGFL permanent before we start using it to return + * freespace overflow to the freespace btrees. + */ + sc->sa.agf_bp = agf_bp; + error = xrep_roll_ag_trans(sc); + if (error) + goto err; + + /* Dump any AGFL overflow. */ + error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + XFS_AG_RESV_AGFL); +err: + xagb_bitmap_destroy(&agfl_extents); + return error; +} + +/* AGI */ + +/* + * Offset within the xrep_find_ag_btree array for each btree type. Avoid the + * XFS_BTNUM_ names here to avoid creating a sparse array. + */ +enum { + XREP_AGI_INOBT = 0, + XREP_AGI_FINOBT, + XREP_AGI_END, + XREP_AGI_MAX +}; + +/* + * Given the inode btree roots described by *fab, find the roots, check them + * for sanity, and pass the root data back out via *fab. + */ +STATIC int +xrep_agi_find_btrees( + struct xfs_scrub *sc, + struct xrep_find_ag_btree *fab) +{ + struct xfs_buf *agf_bp; + struct xfs_mount *mp = sc->mp; + int error; + + /* Read the AGF. */ + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); + if (error) + return error; + + /* Find the btree roots. */ + error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL); + if (error) + return error; + + /* We must find the inobt root. */ + if (!xrep_check_btree_root(sc, &fab[XREP_AGI_INOBT])) + return -EFSCORRUPTED; + + /* We must find the finobt root if that feature is enabled. */ + if (xfs_has_finobt(mp) && + !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT])) + return -EFSCORRUPTED; + + return 0; +} + +/* + * Reinitialize the AGI header, making an in-core copy of the old contents so + * that we know which in-core state needs to be reinitialized. + */ +STATIC void +xrep_agi_init_header( + struct xfs_scrub *sc, + struct xfs_buf *agi_bp, + struct xfs_agi *old_agi) +{ + struct xfs_agi *agi = agi_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_mount *mp = sc->mp; + + memcpy(old_agi, agi, sizeof(*old_agi)); + memset(agi, 0, BBTOB(agi_bp->b_length)); + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(pag->pag_agno); + agi->agi_length = cpu_to_be32(pag->block_count); + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_has_crc(mp)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); + + /* We don't know how to fix the unlinked list yet. */ + memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked, + sizeof(agi->agi_unlinked)); + + /* Mark the incore AGF data stale until we're done fixing things. 
*/ + ASSERT(xfs_perag_initialised_agi(pag)); + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); +} + +/* Set btree root information in an AGI. */ +STATIC void +xrep_agi_set_roots( + struct xfs_scrub *sc, + struct xfs_agi *agi, + struct xrep_find_ag_btree *fab) +{ + agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); + agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); + + if (xfs_has_finobt(sc->mp)) { + agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root); + agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height); + } +} + +/* Update the AGI counters. */ +STATIC int +xrep_agi_calc_from_btrees( + struct xfs_scrub *sc, + struct xfs_buf *agi_bp) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agi_bp->b_addr; + struct xfs_mount *mp = sc->mp; + xfs_agino_t count; + xfs_agino_t freecount; + int error; + + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); + error = xfs_ialloc_count_inodes(cur, &count, &freecount); + if (error) + goto err; + if (xfs_has_inobtcounts(mp)) { + xfs_agblock_t blocks; + + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + agi->agi_iblocks = cpu_to_be32(blocks); + } + xfs_btree_del_cursor(cur, error); + + agi->agi_count = cpu_to_be32(count); + agi->agi_freecount = cpu_to_be32(freecount); + + if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { + xfs_agblock_t blocks; + + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, + XFS_BTNUM_FINO); + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agi->agi_fblocks = cpu_to_be32(blocks); + } + + return 0; +err: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* Trigger reinitialization of the in-core data. */ +STATIC int +xrep_agi_commit_new( + struct xfs_scrub *sc, + struct xfs_buf *agi_bp) +{ + struct xfs_perag *pag; + struct xfs_agi *agi = agi_bp->b_addr; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF); + xfs_trans_log_buf(sc->tp, agi_bp, 0, BBTOB(agi_bp->b_length) - 1); + + /* Now reinitialize the in-core counters if necessary. */ + pag = sc->sa.pag; + pag->pagi_count = be32_to_cpu(agi->agi_count); + pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + + return 0; +} + +/* Repair the AGI. */ +int +xrep_agi( + struct xfs_scrub *sc) +{ + struct xrep_find_ag_btree fab[XREP_AGI_MAX] = { + [XREP_AGI_INOBT] = { + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }, + [XREP_AGI_FINOBT] = { + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }, + [XREP_AGI_END] = { + .buf_ops = NULL + }, + }; + struct xfs_agi old_agi; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_agi *agi; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + /* + * Make sure we have the AGI buffer, as scrub might have decided it + * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. + */ + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); + if (error) + return error; + agi_bp->b_ops = &xfs_agi_buf_ops; + agi = agi_bp->b_addr; + + /* Find the AGI btree roots. 
*/ + error = xrep_agi_find_btrees(sc, fab); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Start rewriting the header and implant the btrees we found. */ + xrep_agi_init_header(sc, agi_bp, &old_agi); + xrep_agi_set_roots(sc, agi, fab); + error = xrep_agi_calc_from_btrees(sc, agi_bp); + if (error) + goto out_revert; + + /* Reinitialize in-core state. */ + return xrep_agi_commit_new(sc, agi_bp); + +out_revert: + /* Mark the incore AGI state stale and revert the AGI. */ + clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); + memcpy(agi, &old_agi, sizeof(old_agi)); + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c new file mode 100644 index 0000000000..279af72b16 --- /dev/null +++ b/fs/xfs/scrub/alloc.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "xfs_ag.h" + +/* + * Set us up to scrub free space btrees. + */ +int +xchk_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + return xchk_setup_ag_btree(sc, false); +} + +/* Free space btree scrubber. */ + +struct xchk_alloc { + /* Previous free space extent. */ + struct xfs_alloc_rec_incore prev; +}; + +/* + * Ensure there's a corresponding cntbt/bnobt record matching this + * bnobt/cntbt record, respectively. + */ +STATIC void +xchk_allocbt_xref_other( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_btree_cur **pcur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int has_otherrec; + int error; + + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + pcur = &sc->sa.cnt_cur; + else + pcur = &sc->sa.bno_cur; + if (!*pcur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); + if (!xchk_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xchk_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + error = xfs_alloc_get_rec(*pcur, &fbno, &flen, &has_otherrec); + if (!xchk_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xchk_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + if (fbno != agbno || flen != len) + xchk_btree_xref_set_corrupt(sc, *pcur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_allocbt_xref( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *irec) +{ + xfs_agblock_t agbno = irec->ar_startblock; + xfs_extlen_t len = irec->ar_blockcount; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_allocbt_xref_other(sc, agbno, len); + xchk_xref_is_not_inode_chunk(sc, agbno, len); + xchk_xref_has_no_owner(sc, agbno, len); + xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); +} + +/* Flag failures for records that could be merged. 
*/ +STATIC void +xchk_allocbt_mergeable( + struct xchk_btree *bs, + struct xchk_alloc *ca, + const struct xfs_alloc_rec_incore *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (ca->prev.ar_blockcount > 0 && + ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock && + ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&ca->prev, irec, sizeof(*irec)); +} + +/* Scrub a bnobt/cntbt record. */ +STATIC int +xchk_allocbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_alloc_rec_incore irec; + struct xchk_alloc *ca = bs->private; + + xfs_alloc_btrec_to_irec(rec, &irec); + if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + xchk_allocbt_mergeable(bs, ca, &irec); + xchk_allocbt_xref(bs->sc, &irec); + + return 0; +} + +/* Scrub the freespace btrees for some AG. */ +STATIC int +xchk_allocbt( + struct xfs_scrub *sc, + xfs_btnum_t which) +{ + struct xchk_alloc ca = { }; + struct xfs_btree_cur *cur; + + cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); +} + +int +xchk_bnobt( + struct xfs_scrub *sc) +{ + return xchk_allocbt(sc, XFS_BTNUM_BNO); +} + +int +xchk_cntbt( + struct xfs_scrub *sc) +{ + return xchk_allocbt(sc, XFS_BTNUM_CNT); +} + +/* xref check that the extent is not free */ +void +xchk_xref_is_used_space( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c new file mode 100644 index 0000000000..6c16d9530c --- /dev/null +++ b/fs/xfs/scrub/attr.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/dabtree.h" +#include "scrub/attr.h" + +/* Free the buffers linked from the xattr buffer. */ +static void +xchk_xattr_buf_cleanup( + void *priv) +{ + struct xchk_xattr_buf *ab = priv; + + kvfree(ab->freemap); + ab->freemap = NULL; + kvfree(ab->usedmap); + ab->usedmap = NULL; + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; +} + +/* + * Allocate the free space bitmap if we're trying harder; there are leaf blocks + * in the attr fork; or we can't tell if there are leaf blocks. 
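 * In other words, the freemap is needed only when the attr fork has (or
 * might have) leaf-format blocks to cross-check; a purely shortform fork
 * keeps its attrs inside the inode and has no freemap to validate.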
+ */ +static inline bool +xchk_xattr_want_freemap( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + if (sc->flags & XCHK_TRY_HARDER) + return true; + + if (!sc->ip) + return true; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (!ifp) + return false; + + return xfs_ifork_has_extents(ifp); +} + +/* + * Allocate enough memory to hold an attr value and attr block bitmaps, + * reallocating the buffer if necessary. Buffer contents are not preserved + * across a reallocation. + */ +static int +xchk_setup_xattr_buf( + struct xfs_scrub *sc, + size_t value_size) +{ + size_t bmp_sz; + struct xchk_xattr_buf *ab = sc->buf; + void *new_val; + + bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); + + if (ab) + goto resize_value; + + ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS); + if (!ab) + return -ENOMEM; + sc->buf = ab; + sc->buf_cleanup = xchk_xattr_buf_cleanup; + + ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->usedmap) + return -ENOMEM; + + if (xchk_xattr_want_freemap(sc)) { + ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->freemap) + return -ENOMEM; + } + +resize_value: + if (ab->value_sz >= value_size) + return 0; + + if (ab->value) { + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; + } + + new_val = kvmalloc(value_size, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + ab->value = new_val; + ab->value_sz = value_size; + return 0; +} + +/* Set us up to scrub an inode's extended attributes. */ +int +xchk_setup_xattr( + struct xfs_scrub *sc) +{ + int error; + + /* + * We failed to get memory while checking attrs, so this time try to + * get all the memory we're ever going to need. Allocate the buffer + * without the inode lock held, which means we can sleep. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX); + if (error) + return error; + } + + return xchk_setup_inode_contents(sc, 0); +} + +/* Extended Attributes */ + +struct xchk_xattr { + struct xfs_attr_list_context context; + struct xfs_scrub *sc; +}; + +/* + * Check that an extended attribute key can be looked up by hash. + * + * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) + * to call this function for every attribute key in an inode. Once + * we're here, we load the attribute value to see if any errors happen, + * or if we get more or less data than we expected. + */ +static void +xchk_xattr_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) +{ + struct xfs_da_args args = { + .op_flags = XFS_DA_OP_NOTIME, + .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = context->dp->i_mount->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = context->dp, + .name = name, + .namelen = namelen, + .hashval = xfs_da_hashname(name, namelen), + .trans = context->tp, + .valuelen = valuelen, + }; + struct xchk_xattr_buf *ab; + struct xchk_xattr *sx; + int error = 0; + + sx = container_of(context, struct xchk_xattr, context); + ab = sx->sc->buf; + + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = error; + return; + } + + if (flags & XFS_ATTR_INCOMPLETE) { + /* Incomplete attr key, just mark the inode for preening. */ + xchk_ino_set_preen(sx->sc, context->dp->i_ino); + return; + } + + /* Only one namespace bit allowed. */ + if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + + /* Does this name make sense? 
*/ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + + /* + * Local xattr values are stored in the attr leaf block, so we don't + * need to retrieve the value from a remote block to detect corruption + * problems. + */ + if (flags & XFS_ATTR_LOCAL) + goto fail_xref; + + /* + * Try to allocate enough memory to extrat the attr value. If that + * doesn't work, we overload the seen_enough variable to convey + * the error message back to the main scrub function. + */ + error = xchk_setup_xattr_buf(sx->sc, valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) { + context->seen_enough = error; + return; + } + + args.value = ab->value; + + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) + error = -EFSCORRUPTED; + if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + &error)) + goto fail_xref; + if (args.valuelen != valuelen) + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, + args.blkno); +fail_xref: + if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + context->seen_enough = 1; + return; +} + +/* + * Mark a range [start, start+len) in this map. Returns true if the + * region was free, and false if there's a conflict or a problem. + * + * Within a char, the lowest bit of the char represents the byte with + * the smallest address + */ +STATIC bool +xchk_xattr_set_map( + struct xfs_scrub *sc, + unsigned long *map, + unsigned int start, + unsigned int len) +{ + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + bool ret = true; + + if (start >= mapsize) + return false; + if (start + len > mapsize) { + len = mapsize - start; + ret = false; + } + + if (find_next_bit(map, mapsize, start) < start + len) + ret = false; + bitmap_set(map, start, len); + + return ret; +} + +/* + * Check the leaf freemap from the usage bitmap. Returns false if the + * attr freemap has problems or points to used space. + */ +STATIC bool +xchk_xattr_check_freemap( + struct xfs_scrub *sc, + struct xfs_attr3_icleaf_hdr *leafhdr) +{ + struct xchk_xattr_buf *ab = sc->buf; + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + int i; + + /* Construct bitmap of freemap contents. */ + bitmap_zero(ab->freemap, mapsize); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (!xchk_xattr_set_map(sc, ab->freemap, + leafhdr->freemap[i].base, + leafhdr->freemap[i].size)) + return false; + } + + /* Look for bits that are set in freemap and are marked in use. */ + return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize); +} + +/* + * Check this leaf entry's relations to everything else. + * Returns the number of bytes used for the name/value data. + */ +STATIC void +xchk_xattr_entry( + struct xchk_da_btree *ds, + int level, + char *buf_end, + struct xfs_attr_leafblock *leaf, + struct xfs_attr3_icleaf_hdr *leafhdr, + struct xfs_attr_leaf_entry *ent, + int idx, + unsigned int *usedbytes, + __u32 *last_hashval) +{ + struct xfs_mount *mp = ds->state->mp; + struct xchk_xattr_buf *ab = ds->sc->buf; + char *name_end; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + unsigned int nameidx; + unsigned int namesize; + + if (ent->pad2 != 0) + xchk_da_set_corrupt(ds, level); + + /* Hash values in order? 
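 * Entries in an attr leaf block are kept sorted by hashval so that lookups
 * can binary-search the block; a hash smaller than its predecessor means
 * the sort order (and therefore every lookup) is broken.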
*/ + if (be32_to_cpu(ent->hashval) < *last_hashval) + xchk_da_set_corrupt(ds, level); + *last_hashval = be32_to_cpu(ent->hashval); + + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr->firstused || + nameidx >= mp->m_attr_geo->blksize) { + xchk_da_set_corrupt(ds, level); + return; + } + + /* Check the name information. */ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, idx); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, + be16_to_cpu(lentry->valuelen)); + name_end = (char *)lentry + namesize; + if (lentry->namelen == 0) + xchk_da_set_corrupt(ds, level); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, idx); + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + name_end = (char *)rentry + namesize; + if (rentry->namelen == 0 || rentry->valueblk == 0) + xchk_da_set_corrupt(ds, level); + } + if (name_end > buf_end) + xchk_da_set_corrupt(ds, level); + + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize)) + xchk_da_set_corrupt(ds, level); + if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + *usedbytes += namesize; +} + +/* Scrub an attribute leaf. */ +STATIC int +xchk_xattr_block( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_mount *mp = ds->state->mp; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_buf *bp = blk->bp; + xfs_dablk_t *last_checked = ds->private; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + struct xchk_xattr_buf *ab = ds->sc->buf; + char *buf_end; + size_t off; + __u32 last_hashval = 0; + unsigned int usedbytes = 0; + unsigned int hdrsize; + int i; + + if (*last_checked == blk->blkno) + return 0; + + *last_checked = blk->blkno; + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); + + /* Check all the padding. */ + if (xfs_has_crc(ds->sc->mp)) { + struct xfs_attr3_leafblock *leaf3 = bp->b_addr; + + if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 || + leaf3->hdr.info.hdr.pad != 0) + xchk_da_set_corrupt(ds, level); + } else { + if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) + xchk_da_set_corrupt(ds, level); + } + + /* Check the leaf header */ + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + + if (leafhdr.usedbytes > mp->m_attr_geo->blksize) + xchk_da_set_corrupt(ds, level); + if (leafhdr.firstused > mp->m_attr_geo->blksize) + xchk_da_set_corrupt(ds, level); + if (leafhdr.firstused < hdrsize) + xchk_da_set_corrupt(ds, level); + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) + xchk_da_set_corrupt(ds, level); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + entries = xfs_attr3_leaf_entryp(leaf); + if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused) + xchk_da_set_corrupt(ds, level); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + /* Mark the leaf entry itself. */ + off = (char *)ent - (char *)leaf; + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) { + xchk_da_set_corrupt(ds, level); + goto out; + } + + /* Check the entry and nameval. 
*/ + xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr, + ent, i, &usedbytes, &last_hashval); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + } + + if (!xchk_xattr_check_freemap(ds->sc, &leafhdr)) + xchk_da_set_corrupt(ds, level); + + if (leafhdr.usedbytes != usedbytes) + xchk_da_set_corrupt(ds, level); + +out: + return 0; +} + +/* Scrub a attribute btree record. */ +STATIC int +xchk_xattr_rec( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_mount *mp = ds->state->mp; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_buf *bp; + struct xfs_attr_leaf_entry *ent; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + int nameidx; + int hdrsize; + unsigned int badflags; + int error; + + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index; + + /* Check the whole block, if necessary. */ + error = xchk_xattr_block(ds, level); + if (error) + goto out; + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check the hash of the entry. */ + error = xchk_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Find the attr entry's location. */ + bp = blk->bp; + hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr); + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) { + xchk_da_set_corrupt(ds, level); + goto out; + } + + /* Retrieve the entry and check it. */ + hash = be32_to_cpu(ent->hashval); + badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | + XFS_ATTR_INCOMPLETE); + if ((ent->flags & badflags) != 0) + xchk_da_set_corrupt(ds, level); + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = (struct xfs_attr_leaf_name_local *) + (((char *)bp->b_addr) + nameidx); + if (lentry->namelen <= 0) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + } else { + rentry = (struct xfs_attr_leaf_name_remote *) + (((char *)bp->b_addr) + nameidx); + if (rentry->namelen <= 0) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + } + if (calc_hash != hash) + xchk_da_set_corrupt(ds, level); + +out: + return error; +} + +/* Check space usage of shortform attrs. 
*/ +STATIC int +xchk_xattr_check_sf( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + + bitmap_zero(ab->usedmap, ifp->if_bytes); + sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; + end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + + sfe = &sf->list[0]; + if ((unsigned char *)sfe > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + for (i = 0; i < sf->hdr.count; i++) { + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)sf, + sizeof(struct xfs_attr_sf_entry))) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)name - (char *)sf, + sfe->namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)value - (char *)sf, + sfe->valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + sfe = next; + } + + return 0; +} + +/* Scrub the extended attribute metadata. */ +int +xchk_xattr( + struct xfs_scrub *sc) +{ + struct xchk_xattr sx = { + .sc = sc, + .context = { + .dp = sc->ip, + .tp = sc->tp, + .resynch = 1, + .put_listent = xchk_xattr_listent, + .allow_incomplete = true, + }, + }; + xfs_dablk_t last_checked = -1U; + int error = 0; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + /* Allocate memory for xattr checking. */ + error = xchk_setup_xattr_buf(sc, 0); + if (error == -ENOMEM) + return -EDEADLOCK; + if (error) + return error; + + /* Check the physical structure of the xattr. */ + if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + error = xchk_xattr_check_sf(sc); + else + error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, + &last_checked); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Look up every xattr in this file by name and hash. + * + * Use the backend implementation of xfs_attr_list to call + * xchk_xattr_listent on every attribute key in this inode. + * In other words, we use the same iterator/callback mechanism + * that listattr uses to scrub extended attributes, though in our + * _listent function, we check the value of the attribute. + * + * The VFS only locks i_rwsem when modifying attrs, so keep all + * three locks held because that's the only way to ensure we're + * the only thread poking into the da btree. We traverse the da + * btree while holding a leaf buffer locked for the xattr name + * iteration, which doesn't really follow the usual buffer + * locking order. + */ + error = xfs_attr_list_ilocked(&sx.context); + if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* Did our listent function try to return any errors? 
*/ + if (sx.context.seen_enough < 0) + return sx.context.seen_enough; + + return 0; +} diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h new file mode 100644 index 0000000000..48fd9402c4 --- /dev/null +++ b/fs/xfs/scrub/attr.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ATTR_H__ +#define __XFS_SCRUB_ATTR_H__ + +/* + * Temporary storage for online scrub and repair of extended attributes. + */ +struct xchk_xattr_buf { + /* Bitmap of used space in xattr leaf blocks and shortform forks. */ + unsigned long *usedmap; + + /* Bitmap of free space in xattr leaf blocks. */ + unsigned long *freemap; + + /* Memory buffer used to extract xattr values. */ + void *value; + size_t value_sz; +}; + +#endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c new file mode 100644 index 0000000000..e0c89a9a0c --- /dev/null +++ b/fs/xfs/scrub/bitmap.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "scrub/scrub.h" +#include "scrub/bitmap.h" + +#include <linux/interval_tree_generic.h> + +struct xbitmap_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint64_t bn_start; + + /* Last set bit of this interval. */ + uint64_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint64_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint64_t parameters. */ + +#define START(node) ((node)->bn_start) +#define LAST(node) ((node)->bn_last) + +/* + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. + */ +static inline void +xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline void +xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline struct xbitmap_node * +xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, + uint64_t last); + +static inline struct xbitmap_node * +xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, + uint64_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap_node, bn_rbnode)) + +/* Clear a range of this bitmap. 
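 *
 * This is interval tree surgery rather than a bit-at-a-time clear.  For
 * example, clearing 10-19 from a recorded extent 5-29 trims it to 5-9 and
 * inserts a new extent 20-29; an extent that only pokes into one side of
 * the range is trimmed, and extents wholly inside the range are deleted.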
*/ +int +xbitmap_clear( + struct xbitmap *bitmap, + uint64_t start, + uint64_t len) +{ + struct xbitmap_node *bn; + struct xbitmap_node *new_bn; + uint64_t last = start + len - 1; + + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint64_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. */ +int +xbitmap_set( + struct xbitmap *bitmap, + uint64_t start, + uint64_t len) +{ + struct xbitmap_node *left; + struct xbitmap_node *right; + uint64_t last = start + len - 1; + int error; + + /* Is this whole range already set? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap_destroy( + struct xbitmap *bitmap) +{ + struct xbitmap_node *bn; + + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap_init( + struct xbitmap *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; +} + +/* + * Remove all the blocks mentioned in @sub from the extents in @bitmap. 
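 * For example, if @bitmap contains the extents 0-99 and 200-299 and @sub
 * contains 50-249, the result is 0-49 and 250-299.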
+ * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. + */ +int +xbitmap_disunion( + struct xbitmap *bitmap, + struct xbitmap *sub) +{ + struct xbitmap_node *bn; + int error; + + if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + return 0; + + for_each_xbitmap_extent(bn, sub) { + error = xbitmap_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; + } + + return 0; +} + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. 
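 * In terms of the worked example above, this is the step that adds blocks
 * to the list: each level whose bc_levels[i].ptr == 1 is a block the
 * cursor has just entered, so we record it and keep climbing, stopping at
 * the first level whose pointer is greater than 1.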
+ */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} + +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap_hweight( + struct xbitmap *bitmap) +{ + struct xbitmap_node *bn; + uint64_t ret = 0; + + for_each_xbitmap_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap_walk( + struct xbitmap *bitmap, + xbitmap_walk_fn fn, + void *priv) +{ + struct xbitmap_node *bn; + int error = 0; + + for_each_xbitmap_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap_empty( + struct xbitmap *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap_test( + struct xbitmap *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h new file mode 100644 index 0000000000..4fe58bad67 --- /dev/null +++ b/fs/xfs/scrub/bitmap.h @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_BITMAP_H__ +#define __XFS_SCRUB_BITMAP_H__ + +struct xbitmap { + struct rb_root_cached xb_root; +}; + +void xbitmap_init(struct xbitmap *bitmap); +void xbitmap_destroy(struct xbitmap *bitmap); + +int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); +uint64_t xbitmap_hweight(struct xbitmap *bitmap); + +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
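 *
 * A minimal walker might just total up the set bits, along these lines
 * (count_bits is a hypothetical example, not a helper defined here):
 *
 *	static int count_bits(uint64_t start, uint64_t len, void *priv)
 *	{
 *		uint64_t *total = priv;
 *
 *		*total += len;
 *		return 0;
 *	}
 *
 *	uint64_t total = 0;
 *	int error = xbitmap_walk(bitmap, count_bits, &total);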
+ */ +typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, + void *priv); + +bool xbitmap_empty(struct xbitmap *bitmap); +bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_set(&bitmap->agbitmap, start, len); +} + +static inline bool +xagb_bitmap_test( + struct xagb_bitmap *bitmap, + xfs_agblock_t start, + xfs_extlen_t *len) +{ + uint64_t biglen = *len; + bool ret; + + ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); + + if (start + biglen >= UINT_MAX) { + ASSERT(0); + biglen = UINT_MAX - start; + } + + *len = biglen; + return ret; +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap_walk_fn fn, void *priv) +{ + return xbitmap_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c new file mode 100644 index 0000000000..7558891557 --- /dev/null +++ b/fs/xfs/scrub/bmap.c @@ -0,0 +1,959 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "xfs_ag.h" + +/* Set us up with an inode's bmap. */ +int +xchk_setup_inode_bmap( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_iget_for_scrubbing(sc); + if (error) + goto out; + + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * We don't want any ephemeral data/cow fork updates sitting around + * while we inspect block mappings, so wait for directio to finish + * and flush dirty data if we have delalloc reservations. 
+ */ + if (S_ISREG(VFS_I(sc->ip)->i_mode) && + sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + + xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + + inode_dio_wait(VFS_I(sc->ip)); + + /* + * Try to flush all incore state to disk before we examine the + * space mappings for the data fork. Leave accumulated errors + * in the mapping for the writer threads to consume. + * + * On ENOSPC or EIO writeback errors, we continue into the + * extent mapping checks because write failures do not + * necessarily imply anything about the correctness of the file + * metadata. The metadata and the file data could be on + * completely separate devices; a media failure might only + * affect a subset of the disk, etc. We can handle delalloc + * extents in the scrubber, so leaving them in memory is fine. + */ + error = filemap_fdatawrite(mapping); + if (!error) + error = filemap_fdatawait_keep_errors(mapping); + if (error && (error != -ENOSPC && error != -EIO)) + goto out; + } + + /* Got the inode, lock it and we're ready to go. */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out; + + xchk_ilock(sc, XFS_ILOCK_EXCL); +out: + /* scrub teardown will unlock and release the inode */ + return error; +} + +/* + * Inode fork block mapping (BMBT) scrubber. + * More complex than the others because we have to scrub + * all the extents regardless of whether or not the fork + * is in btree format. + */ + +struct xchk_bmap_info { + struct xfs_scrub *sc; + + /* Incore extent tree cursor */ + struct xfs_iext_cursor icur; + + /* Previous fork mapping that we examined */ + struct xfs_bmbt_irec prev_rec; + + /* Is this a realtime fork? */ + bool is_rt; + + /* May mappings point to shared space? */ + bool is_shared; + + /* Was the incore extent tree loaded? */ + bool was_loaded; + + /* Which inode fork are we checking? */ + int whichfork; +}; + +/* Look for a corresponding rmap for this irec. */ +static inline bool +xchk_bmap_get_rmap( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno, + uint64_t owner, + struct xfs_rmap_irec *rmap) +{ + xfs_fileoff_t offset; + unsigned int rflags = 0; + int has_rmap; + int error; + + if (info->whichfork == XFS_ATTR_FORK) + rflags |= XFS_RMAP_ATTR_FORK; + if (irec->br_state == XFS_EXT_UNWRITTEN) + rflags |= XFS_RMAP_UNWRITTEN; + + /* + * CoW staging extents are owned (on disk) by the refcountbt, so + * their rmaps do not have offsets. + */ + if (info->whichfork == XFS_COW_FORK) + offset = 0; + else + offset = irec->br_startoff; + + /* + * If the caller thinks this could be a shared bmbt extent (IOWs, + * any data fork extent of a reflink inode) then we have to use the + * range rmap lookup to make sure we get the correct owner/offset. + */ + if (info->is_shared) { + error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); + } else { + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); + } + if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) + return false; + + if (!has_rmap) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return has_rmap; +} + +/* Make sure that we have rmapbt records for this data/attr fork extent. 
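 *
 * For instance, a data fork mapping of 8 blocks at file offset 100 backed
 * by agbno 500 should be matched by an rmap record with rm_startblock 500,
 * rm_blockcount 8, rm_offset 100, and rm_owner equal to the inode number;
 * each of those fields is checked below.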
*/ +STATIC void +xchk_bmap_xref_rmap( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner = info->sc->ip->i_ino; + + if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + return; + + /* Find the rmap record for this irec. */ + if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; + + /* + * The rmap must be an exact match for this incore file mapping record, + * which may have arisen from multiple ondisk records. + */ + if (rmap.rm_startblock != agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap_end != agbno + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the logical offsets. */ + if (rmap.rm_offset != irec->br_startoff) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount; + if (rmap_end != irec->br_startoff + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ + if (rmap.rm_owner != owner) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check for discrepancies between the unwritten flag in the irec and + * the rmap. Note that the (in-memory) CoW fork distinguishes between + * unwritten and written extents, but we don't track that in the rmap + * records because the blocks are owned (on-disk) by the refcountbt, + * which doesn't track unwritten state. + */ + if (!!(irec->br_state == XFS_EXT_UNWRITTEN) != + !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!!(info->whichfork == XFS_ATTR_FORK) != + !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + +/* Make sure that we have rmapbt records for this COW fork extent. */ +STATIC void +xchk_bmap_xref_rmap_cow( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner = XFS_RMAP_OWN_COW; + + if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + return; + + /* Find the rmap record for this irec. */ + if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; + + /* + * CoW staging extents are owned by the refcount btree, so the rmap + * can start before and end after the physical space allocated to this + * mapping. There are no offsets to check. + */ + if (rmap.rm_startblock > agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap_end < agbno + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ + if (rmap.rm_owner != owner) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * No flags allowed. 
Note that the (in-memory) CoW fork distinguishes + * between unwritten and written extents, but we don't track that in + * the rmap records because the blocks are owned (on-disk) by the + * refcountbt, which doesn't track unwritten state. + */ + if (rmap.rm_flags & XFS_RMAP_ATTR_FORK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_UNWRITTEN) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + +/* Cross-reference a single rtdev extent record. */ +STATIC void +xchk_bmap_rt_iextent_xref( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, + irec->br_blockcount); +} + +/* Cross-reference a single datadev extent record. */ +STATIC void +xchk_bmap_iextent_xref( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = info->sc->mp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t len; + int error; + + agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); + len = irec->br_blockcount; + + error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + goto out_free; + + xchk_xref_is_used_space(info->sc, agbno, len); + xchk_xref_is_not_inode_chunk(info->sc, agbno, len); + switch (info->whichfork) { + case XFS_DATA_FORK: + xchk_bmap_xref_rmap(info, irec, agbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + } + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; + case XFS_ATTR_FORK: + xchk_bmap_xref_rmap(info, irec, agbno); + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &oinfo); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, agbno); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &XFS_RMAP_OINFO_COW); + xchk_xref_is_cow_staging(info->sc, agbno, + irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + break; + } + +out_free: + xchk_ag_free(info->sc, &info->sc->sa); +} + +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + +/* Scrub a single extent record. 
*/ +STATIC void +xchk_bmap_iextent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + xchk_bmap_dirattr_extent(ip, info, irec); + + /* Make sure the extent points to a valid place. */ + if (info->is_rt && + !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (!info->is_rt && + !xfs_verify_fsbext(mp, irec->br_startblock, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* We don't allow unwritten extents on attr forks. */ + if (irec->br_state == XFS_EXT_UNWRITTEN && + info->whichfork == XFS_ATTR_FORK) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (info->is_rt) + xchk_bmap_rt_iextent_xref(ip, info, irec); + else + xchk_bmap_iextent_xref(ip, info, irec); +} + +/* Scrub a bmbt record. */ +STATIC int +xchk_bmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec iext_irec; + struct xfs_iext_cursor icur; + struct xchk_bmap_info *info = bs->private; + struct xfs_inode *ip = bs->cur->bc_ino.ip; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork); + uint64_t owner; + int i; + + /* + * Check the owners of the btree blocks up to the level below + * the root since the verifiers don't do that. + */ + if (xfs_has_crc(bs->cur->bc_mp) && + bs->cur->bc_levels[0].ptr == 1) { + for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { + block = xfs_btree_get_block(bs->cur, i, &bp); + owner = be64_to_cpu(block->bb_u.l.bb_owner); + if (owner != ip->i_ino) + xchk_fblock_set_corrupt(bs->sc, + info->whichfork, 0); + } + } + + /* + * Check that the incore extent tree contains an extent that matches + * this one exactly. We validate those cached bmaps later, so we don't + * need to check them here. If the incore extent tree was just loaded + * from disk by the scrubber, we assume that its contents match what's + * on disk (we still hold the ILOCK) and skip the equivalence check. + */ + if (!info->was_loaded) + return 0; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) { + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; + } + + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, + &iext_irec) || + irec.br_startoff != iext_irec.br_startoff || + irec.br_startblock != iext_irec.br_startblock || + irec.br_blockcount != iext_irec.br_blockcount || + irec.br_state != iext_irec.br_state) + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; +} + +/* Scan the btree records. 
*/ +STATIC int +xchk_bmap_btree( + struct xfs_scrub *sc, + int whichfork, + struct xchk_bmap_info *info) +{ + struct xfs_owner_info oinfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_btree_cur *cur; + int error; + + /* Load the incore bmap cache if it's not loaded. */ + info->was_loaded = !xfs_need_iread_extents(ifp); + + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + + /* Check the btree structure. */ + cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); + error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); + xfs_btree_del_cursor(cur, error); +out: + return error; +} + +struct xchk_bmap_check_rmap_info { + struct xfs_scrub *sc; + int whichfork; + struct xfs_iext_cursor icur; +}; + +/* Can we find bmaps that fit this rmap? */ +STATIC int +xchk_bmap_check_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_bmbt_irec irec; + struct xfs_rmap_irec check_rec; + struct xchk_bmap_check_rmap_info *sbcri = priv; + struct xfs_ifork *ifp; + struct xfs_scrub *sc = sbcri->sc; + bool have_map; + + /* Is this even the right fork? */ + if (rec->rm_owner != sc->ip->i_ino) + return 0; + if ((sbcri->whichfork == XFS_ATTR_FORK) ^ + !!(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return 0; + + /* Now look up the bmbt record. */ + ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork); + if (!ifp) { + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + goto out; + } + have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset, + &sbcri->icur, &irec); + if (!have_map) + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + /* + * bmap extent record lengths are constrained to 2^21 blocks in length + * because of space constraints in the on-disk metadata structure. + * However, rmap extent record lengths are constrained only by AG + * length, so we have to loop through the bmbt to make sure that the + * entire rmap is covered by bmbt records. + */ + check_rec = *rec; + while (have_map) { + if (irec.br_startoff != check_rec.rm_offset) + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + check_rec.rm_offset); + if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, + cur->bc_ag.pag->pag_agno, + check_rec.rm_startblock)) + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + check_rec.rm_offset); + if (irec.br_blockcount > check_rec.rm_blockcount) + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + check_rec.rm_offset); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + check_rec.rm_startblock += irec.br_blockcount; + check_rec.rm_offset += irec.br_blockcount; + check_rec.rm_blockcount -= irec.br_blockcount; + if (check_rec.rm_blockcount == 0) + break; + have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec); + if (!have_map) + xchk_fblock_set_corrupt(sc, sbcri->whichfork, + check_rec.rm_offset); + } + +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return 0; +} + +/* Make sure each rmap has a corresponding bmbt entry. 
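 *
 * This works by walking every record in one AG's rmap btree and running
 * xchk_bmap_check_rmap() against each of them; the callback returns
 * -ECANCELED once corruption has been flagged so that the query can stop
 * early, and that code is filtered back out below.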
*/ +STATIC int +xchk_bmap_check_ag_rmaps( + struct xfs_scrub *sc, + int whichfork, + struct xfs_perag *pag) +{ + struct xchk_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag); + + sbcri.sc = sc; + sbcri.whichfork = whichfork; + error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); + if (error == -ECANCELED) + error = 0; + + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(sc->tp, agf); + return error; +} + +/* + * Decide if we want to walk every rmap btree in the fs to make sure that each + * rmap for this file fork has corresponding bmbt entries. + */ +static bool +xchk_bmap_want_check_rmaps( + struct xchk_bmap_info *info) +{ + struct xfs_scrub *sc = info->sc; + struct xfs_ifork *ifp; + + if (!xfs_has_rmapbt(sc->mp)) + return false; + if (info->whichfork == XFS_COW_FORK) + return false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; + + /* Don't support realtime rmap checks yet. */ + if (info->is_rt) + return false; + + /* + * The inode repair code zaps broken inode forks by resetting them back + * to EXTENTS format and zero extent records. If we encounter a fork + * in this state along with evidence that the fork isn't supposed to be + * empty, we need to scan the reverse mappings to decide if we're going + * to rebuild the fork. Data forks with nonzero file size are scanned. + * xattr forks are never empty of content, so they are always scanned. + */ + ifp = xfs_ifork_ptr(sc->ip, info->whichfork); + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { + if (info->whichfork == XFS_DATA_FORK && + i_size_read(VFS_I(sc->ip)) == 0) + return false; + + return true; + } + + return false; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rmaps( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + for_each_perag(sc->mp, agno, pag) { + error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_rele(pag); + return error; + } + } + + return 0; +} + +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + +/* Decide if this individual fork mapping is ok. */ +static bool +xchk_bmap_iext_mapping( + struct xchk_bmap_info *info, + const struct xfs_bmbt_irec *irec) +{ + /* There should never be a "hole" extent in either extent list. 
*/ + if (irec->br_startblock == HOLESTARTBLOCK) + return false; + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + return false; + return true; +} + +/* Are these two mappings contiguous with each other? */ +static inline bool +xchk_are_bmaps_contiguous( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't try to combine unallocated mappings. */ + if (!xfs_bmap_is_real_extent(b1)) + return false; + if (!xfs_bmap_is_real_extent(b2)) + return false; + + /* Does b2 come right after b1 in the logical and physical range? */ + if (b1->br_startoff + b1->br_blockcount != b2->br_startoff) + return false; + if (b1->br_startblock + b1->br_blockcount != b2->br_startblock) + return false; + if (b1->br_state != b2->br_state) + return false; + return true; +} + +/* + * Walk the incore extent records, accumulating consecutive contiguous records + * into a single incore mapping. Returns true if @irec has been set to a + * mapping or false if there are no more mappings. Caller must ensure that + * @info.icur is zeroed before the first call. + */ +static bool +xchk_bmap_iext_iter( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + unsigned int nr = 0; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + /* Advance to the next iextent record and check the mapping. */ + xfs_iext_next(ifp, &info->icur); + if (!xfs_iext_get_extent(ifp, &info->icur, irec)) + return false; + + if (!xchk_bmap_iext_mapping(info, irec)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return false; + } + nr++; + + /* + * Iterate subsequent iextent records and merge them with the one + * that we just read, if possible. + */ + while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { + if (!xchk_are_bmaps_contiguous(irec, &got)) + break; + + if (!xchk_bmap_iext_mapping(info, &got)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + got.br_startoff); + return false; + } + nr++; + + irec->br_blockcount += got.br_blockcount; + xfs_iext_next(ifp, &info->icur); + } + + /* + * If the merged mapping could be expressed with fewer bmbt records + * than we actually found, notify the user that this fork could be + * optimized. CoW forks only exist in memory so we ignore them. + */ + if (nr > 1 && info->whichfork != XFS_COW_FORK && + howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + + return true; +} + +/* + * Scrub an inode fork's block mappings. + * + * First we scan every record in every btree block, if applicable. + * Then we unconditionally scan the incore extent cache. + */ +STATIC int +xchk_bmap( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_bmbt_irec irec; + struct xchk_bmap_info info = { NULL }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_fileoff_t endoff; + int error = 0; + + /* Non-existent forks can be ignored. */ + if (!ifp) + return -ENOENT; + + info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.whichfork = whichfork; + info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); + info.sc = sc; + + switch (whichfork) { + case XFS_COW_FORK: + /* No CoW forks on non-reflink filesystems. 
*/ + if (!xfs_has_reflink(mp)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + break; + case XFS_ATTR_FORK: + if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + break; + default: + ASSERT(whichfork == XFS_DATA_FORK); + break; + } + + /* Check the fork values */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + /* No mappings to check. */ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); + return 0; + case XFS_DINODE_FMT_EXTENTS: + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_COW_FORK) { + xchk_fblock_set_corrupt(sc, whichfork, 0); + return 0; + } + + error = xchk_bmap_btree(sc, whichfork, &info); + if (error) + return error; + break; + default: + xchk_fblock_set_corrupt(sc, whichfork, 0); + return 0; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Find the offset of the last extent in the mapping. */ + error = xfs_bmap_last_offset(ip, &endoff, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + return error; + + /* + * Scrub extent records. We use a special iterator function here that + * combines adjacent mappings if they are logically and physically + * contiguous. For large allocations that require multiple bmbt + * records, this reduces the number of cross-referencing calls, which + * reduces runtime. Cross referencing with the rmap is simpler because + * the rmap must match the combined mapping exactly. + */ + while (xchk_bmap_iext_iter(&info, &irec)) { + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return 0; + + if (irec.br_startoff >= endoff) { + xchk_fblock_set_corrupt(sc, whichfork, + irec.br_startoff); + return 0; + } + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec)); + } + + if (xchk_bmap_want_check_rmaps(&info)) { + error = xchk_bmap_check_rmaps(sc, whichfork); + if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) + return error; + } + + return 0; +} + +/* Scrub an inode's data fork. */ +int +xchk_bmap_data( + struct xfs_scrub *sc) +{ + return xchk_bmap(sc, XFS_DATA_FORK); +} + +/* Scrub an inode's attr fork. */ +int +xchk_bmap_attr( + struct xfs_scrub *sc) +{ + return xchk_bmap(sc, XFS_ATTR_FORK); +} + +/* Scrub an inode's CoW fork. */ +int +xchk_bmap_cow( + struct xfs_scrub *sc) +{ + return xchk_bmap(sc, XFS_COW_FORK); +} diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c new file mode 100644 index 0000000000..1935b9ce18 --- /dev/null +++ b/fs/xfs/scrub/btree.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* btree scrubbing */ + +/* + * Check for btree operation errors. See the section about handling + * operational errors in common.c. 
+ */ +static bool +__xchk_btree_process_error( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level, + int *error, + __u32 errflag, + void *ret_ip) +{ + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + case -ECHRNG: + /* Used to restart an op with deadlock avoidance. */ + trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= errflag; + *error = 0; + fallthrough; + default: + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xchk_ifork_btree_op_error(sc, cur, level, + *error, ret_ip); + else + trace_xchk_btree_op_error(sc, cur, level, + *error, ret_ip); + break; + } + return false; +} + +bool +xchk_btree_process_error( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xchk_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xchk_btree_xref_process_error( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xchk_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + +/* Record btree block corruption. */ +static void +__xchk_btree_set_corrupt( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level, + __u32 errflag, + void *ret_ip) +{ + sc->sm->sm_flags |= errflag; + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xchk_ifork_btree_error(sc, cur, level, + ret_ip); + else + trace_xchk_btree_error(sc, cur, level, + ret_ip); +} + +void +xchk_btree_set_corrupt( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT, + __return_address); +} + +void +xchk_btree_xref_set_corrupt( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT, + __return_address); +} + +void +xchk_btree_set_preen( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN, + __return_address); +} + +/* + * Make sure this record is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xchk_btree_rec( + struct xchk_btree *bs) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_rec *rec; + union xfs_btree_key key; + union xfs_btree_key hkey; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block); + + trace_xchk_btree_rec(bs->sc, cur, 0); + + /* Are all records across all record blocks in order? */ + if (bs->lastrec_valid && + !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) + xchk_btree_set_corrupt(bs->sc, cur, 0); + memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + bs->lastrec_valid = true; + + if (cur->bc_nlevels == 1) + return; + + /* Is low_key(rec) at least as large as the parent low key? */ + cur->bc_ops->init_key_from_rec(&key, rec); + keyblock = xfs_btree_get_block(cur, 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock); + if (xfs_btree_keycmp_lt(cur, &key, keyp)) + xchk_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is high_key(rec) no larger than the parent high key? 
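
The record check above enforces two invariants: records are sorted within and across leaf blocks, and each record's key range sits inside the keyspace advertised by its parent block. A toy userspace version over plain integer keys; the kernel compares opaque union xfs_btree_key values through per-btree ops, so this is only an illustration of the shape of the checks:

#include <stdbool.h>
#include <stdio.h>

/* Does a record's [low, high] key range fall inside the parent's range? */
static bool rec_within_parent(int rec_low, int rec_high,
                              int parent_low, int parent_high)
{
        if (rec_low < parent_low)       /* low key smaller than parent low key */
                return false;
        if (parent_high < rec_high)     /* high key larger than parent high key */
                return false;
        return true;
}

int main(void)
{
        int lastrec = -1;
        int recs[] = { 3, 7, 5 };       /* 5 after 7 is out of order */

        for (int i = 0; i < 3; i++) {
                if (recs[i] <= lastrec)
                        printf("record %d out of order\n", recs[i]);
                lastrec = recs[i];
        }

        printf("in keyspace: %d\n", rec_within_parent(3, 7, 0, 10));    /* 1 */
        printf("in keyspace: %d\n", rec_within_parent(3, 12, 0, 10));   /* 0 */
        return 0;
}
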
*/ + cur->bc_ops->init_high_key_from_rec(&hkey, rec); + keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock); + if (xfs_btree_keycmp_lt(cur, keyp, &hkey)) + xchk_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Make sure this key is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xchk_btree_key( + struct xchk_btree *bs, + int level) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *key; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, level, &bp); + key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); + + trace_xchk_btree_key(bs->sc, cur, level); + + /* Are all low keys across all node blocks in order? */ + if (bs->lastkey[level - 1].valid && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key)) + xchk_btree_set_corrupt(bs->sc, cur, level); + memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len); + bs->lastkey[level - 1].valid = true; + + if (level + 1 >= cur->bc_nlevels) + return; + + /* Is this block's low key at least as large as the parent low key? */ + keyblock = xfs_btree_get_block(cur, level + 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); + if (xfs_btree_keycmp_lt(cur, key, keyp)) + xchk_btree_set_corrupt(bs->sc, cur, level); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is this block's high key no larger than the parent high key? */ + key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); + keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, + keyblock); + if (xfs_btree_keycmp_lt(cur, keyp, key)) + xchk_btree_set_corrupt(bs->sc, cur, level); +} + +/* + * Check a btree pointer. Returns true if it's ok to use this pointer. + * Callers do not need to set the corrupt flag. + */ +static bool +xchk_btree_ptr_ok( + struct xchk_btree *bs, + int level, + union xfs_btree_ptr *ptr) +{ + bool res; + + /* A btree rooted in an inode has no block pointer to the root. */ + if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == bs->cur->bc_nlevels) + return true; + + /* Otherwise, check the pointers. */ + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); + else + res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); + if (!res) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + + return res; +} + +/* Check that a btree block's sibling matches what we expect it. */ +STATIC int +xchk_btree_block_check_sibling( + struct xchk_btree *bs, + int level, + int direction, + union xfs_btree_ptr *sibling) +{ + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *pblock; + struct xfs_buf *pbp; + struct xfs_btree_cur *ncur = NULL; + union xfs_btree_ptr *pp; + int success; + int error; + + error = xfs_btree_dup_cursor(cur, &ncur); + if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) || + !ncur) + return error; + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (xfs_btree_ptr_is_null(cur, sibling)) { + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (error == 0 && success) + xchk_btree_set_corrupt(bs->sc, cur, level); + error = 0; + goto out; + } + + /* Increment upper level pointer. 
*/ + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error)) + goto out; + if (!success) { + xchk_btree_set_corrupt(bs->sc, cur, level + 1); + goto out; + } + + /* Compare upper level pointer to sibling pointer. */ + pblock = xfs_btree_get_block(ncur, level + 1, &pbp); + pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock); + if (!xchk_btree_ptr_ok(bs, level + 1, pp)) + goto out; + if (pbp) + xchk_buffer_recheck(bs->sc, pbp); + + if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) + xchk_btree_set_corrupt(bs->sc, cur, level); +out: + xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR); + return error; +} + +/* Check the siblings of a btree block. */ +STATIC int +xchk_btree_block_check_siblings( + struct xchk_btree *bs, + struct xfs_btree_block *block) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_ptr leftsib; + union xfs_btree_ptr rightsib; + int level; + int error = 0; + + xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB); + xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB); + level = xfs_btree_get_level(block); + + /* Root block should never have siblings. */ + if (level == cur->bc_nlevels - 1) { + if (!xfs_btree_ptr_is_null(cur, &leftsib) || + !xfs_btree_ptr_is_null(cur, &rightsib)) + xchk_btree_set_corrupt(bs->sc, cur, level); + goto out; + } + + /* + * Does the left & right sibling pointers match the adjacent + * parent level pointers? + * (These function absorbs error codes for us.) + */ + error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib); + if (error) + return error; + error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib); + if (error) + return error; +out: + return error; +} + +struct check_owner { + struct list_head list; + xfs_daddr_t daddr; + int level; +}; + +/* + * Make sure this btree block isn't in the free list and that there's + * an rmap record for it. + */ +STATIC int +xchk_btree_check_block_owner( + struct xchk_btree *bs, + int level, + xfs_daddr_t daddr) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_btnum_t btnum; + bool init_sa; + int error = 0; + + if (!bs->cur) + return 0; + + btnum = bs->cur->bc_btnum; + agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); + agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); + + init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + if (init_sa) { + error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, + level, &error)) + goto out_free; + } + + xchk_xref_is_used_space(bs->sc, agbno, 1); + /* + * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we + * have to nullify it (to shut down further block owner checks) if + * self-xref encounters problems. + */ + if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + bs->cur = NULL; + + xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); + if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + bs->cur = NULL; + +out_free: + if (init_sa) + xchk_ag_free(bs->sc, &bs->sc->sa); + + return error; +} + +/* Check the owner of a btree block. */ +STATIC int +xchk_btree_check_owner( + struct xchk_btree *bs, + int level, + struct xfs_buf *bp) +{ + struct xfs_btree_cur *cur = bs->cur; + + /* + * In theory, xfs_btree_get_block should only give us a null buffer + * pointer for the root of a root-in-inode btree type, but we need + * to check defensively here in case the cursor state is also screwed + * up. 
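
The check_owner records declared above exist because bnobt and rmapbt blocks cannot be cross-referenced against their own cursor while that btree is being walked; the scrubber queues the block addresses and drains the list after the walk. A small userspace sketch of that defer-then-drain pattern, using a plain singly linked list instead of the kernel's list_head and a stub in place of the real owner check:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct check_owner {
        struct check_owner      *next;
        uint64_t                daddr;
        int                     level;
};

static int check_block_owner(uint64_t daddr, int level)
{
        printf("checking owner of daddr %llu at level %d\n",
               (unsigned long long)daddr, level);
        return 0;
}

int main(void)
{
        struct check_owner *to_check = NULL, *co, *n;
        int error = 0;

        /* Defer a couple of owner checks while "walking" the btree. */
        for (uint64_t daddr = 100; daddr < 300; daddr += 100) {
                co = malloc(sizeof(*co));
                if (!co)
                        return 1;
                co->daddr = daddr;
                co->level = 0;
                co->next = to_check;
                to_check = co;
        }

        /* Drain the deferred checks once the walk has finished. */
        for (co = to_check; co; co = n) {
                n = co->next;
                if (!error)
                        error = check_block_owner(co->daddr, co->level);
                free(co);
        }
        return error;
}
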
+ */ + if (bp == NULL) { + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return 0; + } + + /* + * We want to cross-reference each btree block with the bnobt + * and the rmapbt. We cannot cross-reference the bnobt or + * rmapbt while scanning the bnobt or rmapbt, respectively, + * because we cannot alter the cursor and we'd prefer not to + * duplicate cursors. Therefore, save the buffer daddr for + * later scanning. + */ + if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + struct check_owner *co; + + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); + if (!co) + return -ENOMEM; + + INIT_LIST_HEAD(&co->list); + co->level = level; + co->daddr = xfs_buf_daddr(bp); + list_add_tail(&co->list, &bs->to_check); + return 0; + } + + return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp)); +} + +/* Decide if we want to check minrecs of a btree block in the inode root. */ +static inline bool +xchk_btree_check_iroot_minrecs( + struct xchk_btree *bs) +{ + /* + * xfs_bmap_add_attrfork_btree had an implementation bug wherein it + * would miscalculate the space required for the data fork bmbt root + * when adding an attr fork, and promote the iroot contents to an + * external block unnecessarily. This went unnoticed for many years + * until scrub found filesystems in this state. Inode rooted btrees are + * not supposed to have immediate child blocks that are small enough + * that the contents could fit in the inode root, but we can't fail + * existing filesystems, so instead we disable the check for data fork + * bmap btrees when there's an attr fork. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + bs->cur->bc_ino.whichfork == XFS_DATA_FORK && + xfs_inode_has_attr_fork(bs->sc->ip)) + return false; + + return true; +} + +/* + * Check that this btree block has at least minrecs records or is one of the + * special blocks that don't require that. + */ +STATIC void +xchk_btree_check_minrecs( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + struct xfs_btree_cur *cur = bs->cur; + unsigned int root_level = cur->bc_nlevels - 1; + unsigned int numrecs = be16_to_cpu(block->bb_numrecs); + + /* More records than minrecs means the block is ok. */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) + return; + + /* + * For btrees rooted in the inode, it's possible that the root block + * contents spilled into a regular ondisk block because there wasn't + * enough space in the inode root. The number of records in that + * child block might be less than the standard minrecs, but that's ok + * provided that there's only one direct child of the root. + */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 2) { + struct xfs_btree_block *root_block; + struct xfs_buf *root_bp; + int root_maxrecs; + + root_block = xfs_btree_get_block(cur, root_level, &root_bp); + root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level); + if (xchk_btree_check_iroot_minrecs(bs) && + (be16_to_cpu(root_block->bb_numrecs) != 1 || + numrecs <= root_maxrecs)) + xchk_btree_set_corrupt(bs->sc, cur, level); + return; + } + + /* + * Otherwise, only the root level is allowed to have fewer than minrecs + * records or keyptrs. + */ + if (level < root_level) + xchk_btree_set_corrupt(bs->sc, cur, level); +} + +/* + * If this btree block has a parent, make sure that the parent's keys capture + * the keyspace contained in this block. 
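
The minrecs logic above boils down to a short rule: only the root, or the single direct child of an undersized inode root that could not have been absorbed back into the inode, may hold fewer than minrecs entries. A hypothetical helper expressing that rule over plain integers (the attr-fork exception handled by xchk_btree_check_iroot_minrecs() is left out for brevity):

#include <stdbool.h>
#include <stdio.h>

static bool minrecs_ok(unsigned int numrecs, unsigned int level,
                       unsigned int root_level, unsigned int minrecs,
                       bool root_in_inode, unsigned int root_numrecs,
                       unsigned int root_maxrecs)
{
        if (numrecs >= minrecs)
                return true;
        if (level == root_level)
                return true;            /* the root may be sparse */
        if (root_in_inode && level == root_level - 1)
                /* sole child of an inode root, too big to fit in the inode */
                return root_numrecs == 1 && numrecs > root_maxrecs;
        return false;
}

int main(void)
{
        /* child of an inode root: 2 recs, minrecs 37, root holds 1 keyptr */
        printf("%d\n", minrecs_ok(2, 1, 2, 37, true, 1, 1));    /* 1 */
        printf("%d\n", minrecs_ok(2, 0, 2, 37, true, 1, 1));    /* 0 */
        return 0;
}
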
+ */ +STATIC void +xchk_btree_block_check_keys( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_key; + union xfs_btree_key *block_high_key; + union xfs_btree_key *parent_low_key, *parent_high_key; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level == cur->bc_nlevels - 1) + return; + + xfs_btree_get_keys(cur, block, &block_key); + + /* Make sure the low key of this block matches the parent. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return; + } + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Make sure the high key of this block matches the parent. */ + parent_high_key = xfs_btree_high_key_addr(cur, + cur->bc_levels[level + 1].ptr, parent_block); + block_high_key = xfs_btree_high_key_from_key(cur, &block_key); + if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* + * Grab and scrub a btree block given a btree pointer. Returns block + * and buffer pointers (if applicable) if they're ok to use. + */ +STATIC int +xchk_btree_get_block( + struct xchk_btree *bs, + int level, + union xfs_btree_ptr *pp, + struct xfs_btree_block **pblock, + struct xfs_buf **pbp) +{ + xfs_failaddr_t failed_at; + int error; + + *pblock = NULL; + *pbp = NULL; + + error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock); + if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) || + !*pblock) + return error; + + xfs_btree_get_block(bs->cur, level, pbp); + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, + level, *pbp); + else + failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, + level, *pbp); + if (failed_at) { + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return 0; + } + if (*pbp) + xchk_buffer_recheck(bs->sc, *pbp); + + xchk_btree_check_minrecs(bs, level, *pblock); + + /* + * Check the block's owner; this function absorbs error codes + * for us. + */ + error = xchk_btree_check_owner(bs, level, *pbp); + if (error) + return error; + + /* + * Check the block's siblings; this function absorbs error codes + * for us. + */ + error = xchk_btree_block_check_siblings(bs, *pblock); + if (error) + return error; + + xchk_btree_block_check_keys(bs, level, *pblock); + return 0; +} + +/* + * Check that the low and high keys of this block match the keys stored + * in the parent block. + */ +STATIC void +xchk_btree_block_keys( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_keys; + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *high_bk; + union xfs_btree_key *parent_keys; + union xfs_btree_key *high_pk; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level >= cur->bc_nlevels - 1) + return; + + /* Calculate the keys for this block. */ + xfs_btree_get_keys(cur, block, &block_keys); + + /* Obtain the parent's copy of the keys for this block. 
*/ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + + if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) + xchk_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Get high keys */ + high_bk = xfs_btree_high_key_from_key(cur, &block_keys); + high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + + if (xfs_btree_keycmp_ne(cur, high_bk, high_pk)) + xchk_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Visit all nodes and leaves of a btree. Check that all pointers and + * records are in order, that the keys reflect the records, and use a callback + * so that the caller can verify individual records. + */ +int +xchk_btree( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + xchk_btree_rec_fn scrub_fn, + const struct xfs_owner_info *oinfo, + void *private) +{ + union xfs_btree_ptr ptr; + struct xchk_btree *bs; + union xfs_btree_ptr *pp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + struct xfs_buf *bp; + struct check_owner *co; + struct check_owner *n; + size_t cur_sz; + int level; + int error = 0; + + /* + * Allocate the btree scrub context from the heap, because this + * structure can get rather large. Don't let a caller feed us a + * totally absurd size. + */ + cur_sz = xchk_btree_sizeof(cur->bc_nlevels); + if (cur_sz > PAGE_SIZE) { + xchk_btree_set_corrupt(sc, cur, 0); + return 0; + } + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); + if (!bs) + return -ENOMEM; + bs->cur = cur; + bs->scrub_rec = scrub_fn; + bs->oinfo = oinfo; + bs->private = private; + bs->sc = sc; + + /* Initialize scrub state */ + INIT_LIST_HEAD(&bs->to_check); + + /* + * Load the root of the btree. The helper function absorbs + * error codes for us. + */ + level = cur->bc_nlevels - 1; + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) + goto out; + error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); + if (error || !block) + goto out; + + cur->bc_levels[level].ptr = 1; + + while (level < cur->bc_nlevels) { + block = xfs_btree_get_block(cur, level, &bp); + + if (level == 0) { + /* End of leaf, pop back towards the root. */ + if (cur->bc_levels[level].ptr > + be16_to_cpu(block->bb_numrecs)) { + xchk_btree_block_keys(bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_levels[level + 1].ptr++; + level++; + continue; + } + + /* Records in order for scrub? */ + xchk_btree_rec(bs); + + /* Call out to the record checker. */ + recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, + block); + error = bs->scrub_rec(bs, recp); + if (error) + break; + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + cur->bc_levels[level].ptr++; + continue; + } + + /* End of node, pop back towards the root. */ + if (cur->bc_levels[level].ptr > + be16_to_cpu(block->bb_numrecs)) { + xchk_btree_block_keys(bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_levels[level + 1].ptr++; + level++; + continue; + } + + /* Keys in order for scrub? */ + xchk_btree_key(bs, level); + + /* Drill another level deeper. 
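
At this point in the walk the loop either descends a level or pops back toward the root, driven entirely by the per-level bc_levels[].ptr cursors. A self-contained userspace model of that iterative traversal over a hard-coded two-level toy tree; the array layout is invented for illustration and only the control flow mirrors xchk_btree():

#include <stdio.h>

#define NLEVELS 2

static int leaves[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
static int leaf_numrecs[2] = { 3, 3 };
static int root_children[2] = { 0, 1 };     /* keyptrs to leaf blocks */
static int root_numrecs = 2;

int main(void)
{
        int ptr[NLEVELS];       /* 1-based cursor per level */
        int curleaf = -1;
        int level = NLEVELS - 1;

        ptr[level] = 1;
        while (level < NLEVELS) {
                if (level == 0) {
                        if (ptr[0] > leaf_numrecs[curleaf]) {
                                /* End of leaf: pop back toward the root. */
                                ptr[1]++;
                                level++;
                                continue;
                        }
                        printf("record %d\n", leaves[curleaf][ptr[0] - 1]);
                        ptr[0]++;
                        continue;
                }
                if (ptr[1] > root_numrecs) {
                        /* End of the root: traversal is complete. */
                        level++;
                        continue;
                }
                /* Drill down into the next child block. */
                curleaf = root_children[ptr[1] - 1];
                level--;
                ptr[0] = 1;
        }
        return 0;
}

Running this prints the records 1 through 6 in order, which is the same visit order the scrub loop produces for real leaf records.
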
*/ + pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); + if (!xchk_btree_ptr_ok(bs, level, pp)) { + cur->bc_levels[level].ptr++; + continue; + } + level--; + error = xchk_btree_get_block(bs, level, pp, &block, &bp); + if (error || !block) + goto out; + + cur->bc_levels[level].ptr = 1; + } + +out: + /* Process deferred owner checks on btree blocks. */ + list_for_each_entry_safe(co, n, &bs->to_check, list) { + if (!error && bs->cur) + error = xchk_btree_check_block_owner(bs, co->level, + co->daddr); + list_del(&co->list); + kfree(co); + } + kfree(bs); + + return error; +} diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h new file mode 100644 index 0000000000..c32b5fad61 --- /dev/null +++ b/fs/xfs/scrub/btree.h @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_BTREE_H__ +#define __XFS_SCRUB_BTREE_H__ + +/* btree scrub */ + +/* Check for btree operation errors. */ +bool xchk_btree_process_error(struct xfs_scrub *sc, + struct xfs_btree_cur *cur, int level, int *error); + +/* Check for btree xref operation errors. */ +bool xchk_btree_xref_process_error(struct xfs_scrub *sc, + struct xfs_btree_cur *cur, int level, int *error); + +/* Check for btree corruption. */ +void xchk_btree_set_corrupt(struct xfs_scrub *sc, + struct xfs_btree_cur *cur, int level); +void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level); + +/* Check for btree xref discrepancies. */ +void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, + struct xfs_btree_cur *cur, int level); + +struct xchk_btree; +typedef int (*xchk_btree_rec_fn)( + struct xchk_btree *bs, + const union xfs_btree_rec *rec); + +struct xchk_btree_key { + union xfs_btree_key key; + bool valid; +}; + +struct xchk_btree { + /* caller-provided scrub state */ + struct xfs_scrub *sc; + struct xfs_btree_cur *cur; + xchk_btree_rec_fn scrub_rec; + const struct xfs_owner_info *oinfo; + void *private; + + /* internal scrub state */ + bool lastrec_valid; + union xfs_btree_rec lastrec; + struct list_head to_check; + + /* this element must come last! */ + struct xchk_btree_key lastkey[]; +}; + +/* + * Calculate the size of a xchk_btree structure. There are nlevels-1 slots for + * keys because we track leaf records separately in lastrec. + */ +static inline size_t +xchk_btree_sizeof(unsigned int nlevels) +{ + return struct_size_t(struct xchk_btree, lastkey, nlevels - 1); +} + +int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo, + void *private); + +#endif /* __XFS_SCRUB_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c new file mode 100644 index 0000000000..de24532fe0 --- /dev/null +++ b/fs/xfs/scrub/common.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/health.h" + +/* Common code for the metadata scrubbers. */ + +/* + * Handling operational errors. + * + * The *_process_error() family of functions are used to process error return + * codes from functions called as part of a scrub operation. + * + * If there's no error, we return true to tell the caller that it's ok + * to move on to the next check in its list. + * + * For non-verifier errors (e.g. ENOMEM) we return false to tell the + * caller that something bad happened, and we preserve *error so that + * the caller can return the *error up the stack to userspace. + * + * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting + * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words, + * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT, + * not via return codes. We return false to tell the caller that + * something bad happened. Since the error has been cleared, the caller + * will (presumably) return that zero and scrubbing will move on to + * whatever's next. + * + * ftrace can be used to record the precise metadata location and the + * approximate code location of the failed operation. + */ + +/* Check for operational errors. */ +static bool +__xchk_process_error( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error, + __u32 errflag, + void *ret_ip) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + case -ECHRNG: + /* Used to restart an op with deadlock avoidance. */ + trace_xchk_deadlock_retry( + sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), + sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= errflag; + *error = 0; + fallthrough; + default: + trace_xchk_op_error(sc, agno, bno, *error, + ret_ip); + break; + } + return false; +} + +bool +xchk_process_error( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xchk_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xchk_xref_process_error( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xchk_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + +/* Check for operational errors for a file offset. */ +static bool +__xchk_fblock_process_error( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset, + int *error, + __u32 errflag, + void *ret_ip) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + case -ECHRNG: + /* Used to restart an op with deadlock avoidance. */ + trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. 
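
The comment block above spells out the contract of the *_process_error() helpers: retryable errors pass through for the caller to restart, verifier errors are converted into an OFLAG bit and cleared so scrubbing can continue, and everything else propagates unchanged. A userspace sketch of that classification; the flag value and the errno choices here are stand-ins, not the kernel's XFS_SCRUB_OFLAG_* or EFS* definitions:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define OFLAG_CORRUPT   (1u << 1)       /* stand-in value only */

/*
 * Returns true if the caller may keep checking.  Verifier-style failures
 * (EUCLEAN/EBADMSG here) are recorded in @sm_flags and *error is cleared;
 * other errors are left for the caller to return to userspace.
 */
static bool process_error(unsigned int *sm_flags, int *error)
{
        switch (*error) {
        case 0:
                return true;
        case -EDEADLK:
                /* restartable: let the caller retry with deadlock avoidance */
                break;
        case -EUCLEAN:
        case -EBADMSG:
                *sm_flags |= OFLAG_CORRUPT;
                *error = 0;
                break;
        default:
                break;
        }
        return false;
}

int main(void)
{
        unsigned int flags = 0;
        int error = -EUCLEAN;

        if (!process_error(&flags, &error))
                printf("flags 0x%x, error %d\n", flags, error); /* 0x2, 0 */
        return 0;
}
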
*/ + sc->sm->sm_flags |= errflag; + *error = 0; + fallthrough; + default: + trace_xchk_file_op_error(sc, whichfork, offset, *error, + ret_ip); + break; + } + return false; +} + +bool +xchk_fblock_process_error( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xchk_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xchk_fblock_xref_process_error( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xchk_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + +/* + * Handling scrub corruption/optimization/warning checks. + * + * The *_set_{corrupt,preen,warning}() family of functions are used to + * record the presence of metadata that is incorrect (corrupt), could be + * optimized somehow (preen), or should be flagged for administrative + * review but is not incorrect (warn). + * + * ftrace can be used to record the precise metadata location and + * approximate code location of the failed check. + */ + +/* Record a block which could be optimized. */ +void +xchk_block_set_preen( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address); +} + +/* + * Record an inode which could be optimized. The trace data will + * include the block given by bp if bp is given; otherwise it will use + * the block location of the inode record itself. + */ +void +xchk_ino_set_preen( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_ino_preen(sc, ino, __return_address); +} + +/* Record something being wrong with the filesystem primary superblock. */ +void +xchk_set_corrupt( + struct xfs_scrub *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_fs_error(sc, 0, __return_address); +} + +/* Record a corrupt block. */ +void +xchk_block_set_corrupt( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); +} + +/* Record a corruption while cross-referencing. */ +void +xchk_block_xref_set_corrupt( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); +} + +/* + * Record a corrupt inode. The trace data will include the block given + * by bp if bp is given; otherwise it will use the block location of the + * inode record itself. + */ +void +xchk_ino_set_corrupt( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_ino_error(sc, ino, __return_address); +} + +/* Record a corruption while cross-referencing with an inode. */ +void +xchk_ino_xref_set_corrupt( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xchk_ino_error(sc, ino, __return_address); +} + +/* Record corruption in a block indexed by a file fork. */ +void +xchk_fblock_set_corrupt( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_fblock_error(sc, whichfork, offset, __return_address); +} + +/* Record a corruption while cross-referencing a fork block. 
*/ +void +xchk_fblock_xref_set_corrupt( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xchk_fblock_error(sc, whichfork, offset, __return_address); +} + +/* + * Warn about inodes that need administrative review but is not + * incorrect. + */ +void +xchk_ino_set_warning( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xchk_ino_warning(sc, ino, __return_address); +} + +/* Warn about a block indexed by a file fork that needs review. */ +void +xchk_fblock_set_warning( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xchk_fblock_warning(sc, whichfork, offset, __return_address); +} + +/* Signal an incomplete scrub. */ +void +xchk_set_incomplete( + struct xfs_scrub *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE; + trace_xchk_incomplete(sc, __return_address); +} + +/* + * rmap scrubbing -- compute the number of blocks with a given owner, + * at least according to the reverse mapping data. + */ + +struct xchk_rmap_ownedby_info { + const struct xfs_owner_info *oinfo; + xfs_filblks_t *blocks; +}; + +STATIC int +xchk_count_rmap_ownedby_irec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rmap_ownedby_info *sroi = priv; + bool irec_attr; + bool oinfo_attr; + + irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK; + oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK; + + if (rec->rm_owner != sroi->oinfo->oi_owner) + return 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr) + (*sroi->blocks) += rec->rm_blockcount; + + return 0; +} + +/* + * Calculate the number of blocks the rmap thinks are owned by something. + * The caller should pass us an rmapbt cursor. + */ +int +xchk_count_rmap_ownedby_ag( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + const struct xfs_owner_info *oinfo, + xfs_filblks_t *blocks) +{ + struct xchk_rmap_ownedby_info sroi = { + .oinfo = oinfo, + .blocks = blocks, + }; + + *blocks = 0; + return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec, + &sroi); +} + +/* + * AG scrubbing + * + * These helpers facilitate locking an allocation group's header + * buffers, setting up cursors for all btrees that are present, and + * cleaning everything up once we're through. + */ + +/* Decide if we want to return an AG header read failure. */ +static inline bool +want_ag_read_header_failure( + struct xfs_scrub *sc, + unsigned int type) +{ + /* Return all AG header read failures when scanning btrees. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGI) + return true; + /* + * If we're scanning a given type of AG header, we only want to + * see read failures from that specific header. We'd like the + * other headers to cross-check them, but this isn't required. + */ + if (sc->sm->sm_type == type) + return true; + return false; +} + +/* + * Grab the AG header buffers for the attached perag structure. + * + * The headers should be released by xchk_ag_free, but as a fail safe we attach + * all the buffers we grab to the scrub transaction so they'll all be freed + * when we cancel it. 
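
xchk_count_rmap_ownedby_ag() above pushes a callback through xfs_rmap_query_all() and sums rm_blockcount for every record whose owner, and for inode owners the data/attr fork flag, matches the caller's owner info. A simplified userspace analogue that iterates an in-memory array instead of an rmap btree cursor; the struct layouts are invented for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rmap_rec {
        uint64_t owner;
        uint64_t blockcount;
        bool     attr_fork;
};

struct owner_info {
        uint64_t owner;
        bool     attr_fork;
        bool     non_inode_owner;       /* metadata owners skip the fork check */
};

static uint64_t count_owned_by(const struct rmap_rec *recs, int nr,
                               const struct owner_info *oinfo)
{
        uint64_t blocks = 0;

        for (int i = 0; i < nr; i++) {
                if (recs[i].owner != oinfo->owner)
                        continue;
                /* Inode owners must also match the data/attr fork flag. */
                if (oinfo->non_inode_owner ||
                    recs[i].attr_fork == oinfo->attr_fork)
                        blocks += recs[i].blockcount;
        }
        return blocks;
}

int main(void)
{
        struct rmap_rec recs[] = {
                { .owner = 128, .blockcount = 4, .attr_fork = false },
                { .owner = 128, .blockcount = 2, .attr_fork = true  },
                { .owner = 200, .blockcount = 9, .attr_fork = false },
        };
        struct owner_info oinfo = { .owner = 128, .attr_fork = false };

        printf("%llu blocks\n",
               (unsigned long long)count_owned_by(recs, 3, &oinfo));    /* 4 */
        return 0;
}
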
+ */ +static inline int +xchk_perag_read_headers( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + int error; + + error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) + return error; + + error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) + return error; + + return 0; +} + +/* + * Grab the AG headers for the attached perag structure and wait for pending + * intents to drain. + */ +static int +xchk_perag_drain_and_lock( + struct xfs_scrub *sc) +{ + struct xchk_ag *sa = &sc->sa; + int error = 0; + + ASSERT(sa->pag != NULL); + ASSERT(sa->agi_bp == NULL); + ASSERT(sa->agf_bp == NULL); + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + error = xchk_perag_read_headers(sc, sa); + if (error) + return error; + + /* + * If we've grabbed an inode for scrubbing then we assume that + * holding its ILOCK will suffice to coordinate with any intent + * chains involving this inode. + */ + if (sc->ip) + return 0; + + /* + * Decide if this AG is quiet enough for all metadata to be + * consistent with each other. XFS allows the AG header buffer + * locks to cycle across transaction rolls while processing + * chains of deferred ops, which means that there could be + * other threads in the middle of processing a chain of + * deferred ops. For regular operations we are careful about + * ordering operations to prevent collisions between threads + * (which is why we don't need a per-AG lock), but scrub and + * repair have to serialize against chained operations. + * + * We just locked all the AG headers buffers; now take a look + * to see if there are any intents in progress. If there are, + * drop the AG headers and wait for the intents to drain. + * Since we hold all the AG header locks for the duration of + * the scrub, this is the only time we have to sample the + * intents counter; any threads increasing it after this point + * can't possibly be in the middle of a chain of AG metadata + * updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_perag_intent_busy(sa->pag)) + return 0; + + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_perag_intent_drain(sa->pag); + if (error == -ERESTARTSYS) + error = -EINTR; + } while (!error); + + return error; +} + +/* + * Grab the per-AG structure, grab all AG header buffers, and wait until there + * aren't any pending intents. Returns -ENOENT if we can't grab the perag + * structure. + */ +int +xchk_ag_read_headers( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; + + return xchk_perag_drain_and_lock(sc); +} + +/* Release all the AG btree cursors. 
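
The drain loop described above keeps retrying until it can hold both AG header buffers at a moment when no deferred-op chains are touching the AG. A schematic userspace version of that retry shape; the locking, intent counter, and drain wait are all stubbed out, so only the control flow corresponds to xchk_perag_drain_and_lock():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int busy_intents = 2;            /* pretend two intent chains are active */
static bool drain_enabled = true;

static void lock_ag_headers(void)   { printf("AGI+AGF locked\n"); }
static void unlock_ag_headers(void) { printf("AGI+AGF released\n"); }

static int drain_intents(void)
{
        /* Stand-in for waiting on the per-AG intent counter to hit zero. */
        busy_intents = 0;
        return 0;
}

static int drain_and_lock(void)
{
        int error;

        do {
                lock_ag_headers();
                if (busy_intents == 0)
                        return 0;       /* quiet: keep the locks and proceed */

                unlock_ag_headers();
                if (!drain_enabled)
                        return -ECHRNG; /* ask the caller to enable draining */
                error = drain_intents();
        } while (!error);

        return error;
}

int main(void)
{
        printf("drain_and_lock: %d\n", drain_and_lock());
        return 0;
}
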
*/ +void +xchk_ag_btcur_free( + struct xchk_ag *sa) +{ + if (sa->refc_cur) + xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR); + if (sa->rmap_cur) + xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR); + if (sa->fino_cur) + xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR); + if (sa->ino_cur) + xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR); + if (sa->cnt_cur) + xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR); + if (sa->bno_cur) + xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR); + + sa->refc_cur = NULL; + sa->rmap_cur = NULL; + sa->fino_cur = NULL; + sa->ino_cur = NULL; + sa->bno_cur = NULL; + sa->cnt_cur = NULL; +} + +/* Initialize all the btree cursors for an AG. */ +void +xchk_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { + /* Set up a bnobt cursor for cross-referencing. */ + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag, XFS_BTNUM_BNO); + } + + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { + /* Set up a cntbt cursor for cross-referencing. */ + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag, XFS_BTNUM_CNT); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sa->agi_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_INO); + } + + /* Set up a finobt cursor for cross-referencing. */ + if (sa->agi_bp && xfs_has_finobt(mp) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { + sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_FINO); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_has_rmapbt(mp) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_has_reflink(mp) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + } +} + +/* Release the AG header context and btree cursors. */ +void +xchk_ag_free( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + xchk_ag_btcur_free(sa); + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + if (sa->pag) { + xfs_perag_put(sa->pag); + sa->pag = NULL; + } +} + +/* + * For scrub, grab the perag structure, the AGI, and the AGF headers, in that + * order. Locking order requires us to get the AGI before the AGF. We use the + * transaction to avoid deadlocking on crosslinked metadata buffers; either the + * caller passes one in (bmap scrub) or we have to create a transaction + * ourselves. Returns ENOENT if the perag struct cannot be grabbed. + */ +int +xchk_ag_init( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + int error; + + error = xchk_ag_read_headers(sc, agno, sa); + if (error) + return error; + + xchk_ag_btcur_init(sc, sa); + return 0; +} + +/* Per-scrubber setup functions */ + +void +xchk_trans_cancel( + struct xfs_scrub *sc) +{ + xfs_trans_cancel(sc->tp); + sc->tp = NULL; +} + +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. 
+ * + * If we're going to repair something, we need to ask for the largest possible + * log reservation so that we can handle the worst case scenario for metadata + * updates while rebuilding a metadata item. We also need to reserve as many + * blocks in the head transaction as we think we're going to need to rebuild + * the metadata object. + */ +int +xchk_trans_alloc( + struct xfs_scrub *sc, + uint resblks) +{ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + resblks, 0, 0, &sc->tp); + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + +/* Set us up with a transaction and an empty context. */ +int +xchk_setup_fs( + struct xfs_scrub *sc) +{ + uint resblks; + + resblks = xrep_calc_ag_resblks(sc); + return xchk_trans_alloc(sc, resblks); +} + +/* Set us up with AG headers and btree cursors. */ +int +xchk_setup_ag_btree( + struct xfs_scrub *sc, + bool force_log) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * If the caller asks us to checkpont the log, do so. This + * expensive operation should be performed infrequently and only + * as a last resort. Any caller that sets force_log should + * document why they need to do so. + */ + if (force_log) { + error = xchk_checkpoint_log(mp); + if (error) + return error; + } + + error = xchk_setup_fs(sc); + if (error) + return error; + + return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa); +} + +/* Push everything out of the log onto disk. */ +int +xchk_checkpoint_log( + struct xfs_mount *mp) +{ + int error; + + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + xfs_ail_push_all_sync(mp->m_ail); + return 0; +} + +/* Verify that an inode is allocated ondisk, then return its cached inode. */ +int +xchk_iget( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_inode **ipp) +{ + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); +} + +/* + * Try to grab an inode in a manner that avoids races with physical inode + * allocation. If we can't, return the locked AGI buffer so that the caller + * can single-step the loading process to see where things went wrong. + * Callers must have a valid scrub transaction. + * + * If the iget succeeds, return 0, a NULL AGI, and the inode. + * + * If the iget fails, return the error, the locked AGI, and a NULL inode. This + * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are + * no longer allocated; or any other corruption or runtime error. + * + * If the AGI read fails, return the error, a NULL AGI, and NULL inode. + * + * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode. + */ +int +xchk_iget_agi( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_buf **agi_bpp, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_perag *pag; + int error; + + ASSERT(sc->tp != NULL); + +again: + *agi_bpp = NULL; + *ipp = NULL; + error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Attach the AGI buffer to the scrub transaction to avoid deadlocks + * in the iget cache miss path. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + xfs_perag_put(pag); + if (error) + return error; + + error = xfs_iget(mp, tp, inum, + XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + if (error == -EAGAIN) { + /* + * The inode may be in core but temporarily unavailable and may + * require the AGI buffer before it can be returned. 
Drop the + * AGI buffer and retry the lookup. + * + * Incore lookup will fail with EAGAIN on a cache hit if the + * inode is queued to the inactivation list. The inactivation + * worker may remove the inode from the unlinked list and hence + * needs the AGI. + * + * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN + * to allow inodegc to make progress and move the inode to + * IRECLAIMABLE state where xfs_iget will be able to return it + * again if it can lock the inode. + */ + xfs_trans_brelse(tp, *agi_bpp); + delay(1); + goto again; + } + if (error) + return error; + + /* We got the inode, so we can release the AGI. */ + ASSERT(*ipp != NULL); + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + return 0; +} + +/* Install an inode that we opened by handle for scrubbing. */ +int +xchk_install_handle_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + xchk_irele(sc, ip); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + +/* + * Install an already-referenced inode for scrubbing. Get our own reference to + * the inode to make disposal simpler. The inode must not be in I_FREEING or + * I_WILL_FREE state! + */ +int +xchk_install_live_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!igrab(VFS_I(ip))) { + xchk_ino_set_corrupt(sc, ip->i_ino); + return -EFSCORRUPTED; + } + + sc->ip = ip; + return 0; +} + +/* + * In preparation to scrub metadata structures that hang off of an inode, + * grab either the inode referenced in the scrub control structure or the + * inode passed in. If the inumber does not reference an allocated inode + * record, the function returns ENOENT to end the scrub early. The inode + * is not locked. + */ +int +xchk_iget_for_scrubbing( + struct xfs_scrub *sc) +{ + struct xfs_imap imap; + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + struct xfs_buf *agi_bp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + struct xfs_inode *ip = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); + int error; + + ASSERT(sc->tp == NULL); + + /* We want to scan the inode we already had opened. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) + return xchk_install_live_inode(sc, ip_in); + + /* Reject internal metadata files and obviously bad inode numbers. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_inode(sc, ip); + if (error == -ENOENT) + return error; + if (error != -EINVAL) + goto out_error; + + /* + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is a + * record, and it says this inode is free. + * + * We want to look up this inode in the inobt to distinguish two + * scenarios: (1) the inobt says the inode is free, in which case + * there's nothing to do; and (2) the inobt says the inode is + * allocated, but loading it failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity + * in this AG. Retry the iget in case someone allocated a new inode + * after the first iget failed. 
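
The retry above releases the AGI buffer and sleeps briefly whenever the untrusted iget returns -EAGAIN, so that inode inactivation, which also needs the AGI, can make progress. A minimal model of that release-and-retry loop with stub lock and lookup functions; the third-try success is contrived purely for the demonstration:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int attempts;

static void lock_agi(void)   { printf("AGI locked\n"); }
static void unlock_agi(void) { printf("AGI released\n"); }

/* Pretend the inode becomes available on the third try. */
static int try_iget(void)
{
        return ++attempts < 3 ? -EAGAIN : 0;
}

int main(void)
{
        int error;

        for (;;) {
                lock_agi();
                error = try_iget();
                if (error != -EAGAIN)
                        break;
                /* Drop the AGI so inactivation can run, then retry. */
                unlock_agi();
                usleep(1000);
        }

        if (!error)
                unlock_agi();   /* got the inode; the AGI is no longer needed */
        printf("iget: %d after %d attempts\n", error, attempts);
        return 0;
}
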
+ */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the inode, so install it. */ + xchk_trans_cancel(sc); + return xchk_install_handle_inode(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; + } + + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt thinks this the inode neither can exist inside the + * filesystem nor is allocated, return ENOENT to signal that the check + * can be skipped. + * + * If the lookup returns corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters any other error, exit to userspace. + * + * If the lookup succeeds, something else must be very wrong in the fs + * such that setting up the incore inode failed in some strange way. + * Treat those as corruptions. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; + if (!error) + error = -EFSCORRUPTED; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; +} + +/* Release an inode, possibly dropping it in the process. */ +void +xchk_irele( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (current->journal_info != NULL) { + ASSERT(current->journal_info == sc->tp); + + /* + * If we are in a transaction, we /cannot/ drop the inode + * ourselves, because the VFS will trigger writeback, which + * can require a transaction. Clear DONTCACHE to force the + * inode to the LRU, where someone else can take care of + * dropping it. + * + * Note that when we grabbed our reference to the inode, it + * could have had an active ref and DONTCACHE set if a sysadmin + * is trying to coerce a change in file access mode. icache + * hits do not clear DONTCACHE, so we must do it here. + */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); + } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { + /* + * If this is the last reference to the inode and the caller + * permits it, set DONTCACHE to avoid thrashing. + */ + d_mark_dontcache(VFS_I(ip)); + } + + xfs_irele(ip); +} + +/* + * Set us up to scrub metadata mapped by a file's fork. Callers must not use + * this to operate on user-accessible regular file data because the MMAPLOCK is + * not taken. + */ +int +xchk_setup_inode_contents( + struct xfs_scrub *sc, + unsigned int resblks) +{ + int error; + + error = xchk_iget_for_scrubbing(sc); + if (error) + return error; + + /* Lock the inode so the VFS cannot touch this file. 
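
Taken together, the lookup above degrades gracefully: a plain untrusted iget, then an iget under the AGI lock, then an imap/inobt lookup purely to decide between "the inode is simply gone" and "the metadata is corrupt". A condensed sketch of that decision ladder with stubbed lookups; swap the stub return values to explore the other branches:

#include <errno.h>
#include <stdio.h>

static int untrusted_iget(void)  { return -EINVAL; }
static int iget_under_agi(void)  { return -EINVAL; }
static int imap_lookup(void)     { return -ENOENT; }

static int get_inode_for_scrub(void)
{
        int error = untrusted_iget();

        if (error != -EINVAL)
                return error;   /* success, ENOENT, or a real failure */

        /* EINVAL: retry with the AGI held to exclude inode allocation. */
        error = iget_under_agi();
        if (error != -EINVAL)
                return error;

        /* Still EINVAL: ask the inode btree whether the inode can exist. */
        error = imap_lookup();
        if (error == -EINVAL || error == -ENOENT)
                return -ENOENT;         /* nothing to check */
        if (!error)
                error = -EUCLEAN;       /* exists ondisk yet iget failed: corrupt */
        return error;
}

int main(void)
{
        printf("result: %d\n", get_inode_for_scrub());  /* -ENOENT */
        return 0;
}
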
*/ + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xchk_trans_alloc(sc, resblks); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); +out: + /* scrub teardown will unlock and release the inode for us */ + return error; +} + +void +xchk_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_ilock(sc->ip, ilock_flags); + sc->ilock_flags |= ilock_flags; +} + +bool +xchk_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->ip, ilock_flags)) { + sc->ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xchk_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->ilock_flags &= ~ilock_flags; + xfs_iunlock(sc->ip, ilock_flags); +} + +/* + * Predicate that decides if we need to evaluate the cross-reference check. + * If there was an error accessing the cross-reference btree, just delete + * the cursor and skip the check. + */ +bool +xchk_should_check_xref( + struct xfs_scrub *sc, + int *error, + struct xfs_btree_cur **curpp) +{ + /* No point in xref if we already know we're corrupt. */ + if (xchk_skip_xref(sc->sm)) + return false; + + if (*error == 0) + return true; + + if (curpp) { + /* If we've already given up on xref, just bail out. */ + if (!*curpp) + return false; + + /* xref error, delete cursor and bail out. */ + xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR); + *curpp = NULL; + } + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + trace_xchk_xref_error(sc, *error, __return_address); + + /* + * Errors encountered during cross-referencing with another + * data structure should not cause this scrubber to abort. + */ + *error = 0; + return false; +} + +/* Run the structure verifiers on in-memory buffers to detect bad memory. */ +void +xchk_buffer_recheck( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (bp->b_ops == NULL) { + xchk_block_set_corrupt(sc, bp); + return; + } + if (bp->b_ops->verify_struct == NULL) { + xchk_set_incomplete(sc); + return; + } + fa = bp->b_ops->verify_struct(bp); + if (!fa) + return; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); +} + +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + +/* + * Scrub the attr/data forks of a metadata inode. The metadata inode must be + * pointed to by sc->ip and the ILOCK must be held. + */ +int +xchk_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool shared; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Check the inode record. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Metadata inodes don't live on the rt device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They should never participate in reflink. */ + if (xfs_is_reflink_inode(sc->ip)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They also should never have extended attributes. 
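
xchk_ilock() and its companions above keep sc->ilock_flags in sync with the locks actually held so that scrub teardown can release exactly what was taken, no more and no less. A tiny userspace model of that bookkeeping with a bitmask and stub lock calls; the flag values are invented stand-ins for the XFS lock flags:

#include <stdio.h>

#define LOCK_IO         (1u << 0)       /* stand-in for an IOLOCK flag */
#define LOCK_ILOCK      (1u << 1)       /* stand-in for an ILOCK flag */

struct scrub_ctx {
        unsigned int ilock_flags;       /* which locks we currently hold */
};

static void lock(unsigned int f)   { printf("lock   0x%x\n", f); }
static void unlock(unsigned int f) { printf("unlock 0x%x\n", f); }

static void ctx_ilock(struct scrub_ctx *sc, unsigned int flags)
{
        lock(flags);
        sc->ilock_flags |= flags;
}

static void ctx_iunlock(struct scrub_ctx *sc, unsigned int flags)
{
        sc->ilock_flags &= ~flags;
        unlock(flags);
}

static void ctx_teardown(struct scrub_ctx *sc)
{
        if (sc->ilock_flags)
                ctx_iunlock(sc, sc->ilock_flags);   /* release whatever is left */
}

int main(void)
{
        struct scrub_ctx sc = { 0 };

        ctx_ilock(&sc, LOCK_IO);
        ctx_ilock(&sc, LOCK_ILOCK);
        ctx_teardown(&sc);              /* unlocks both in one call */
        return 0;
}
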
*/ + if (xfs_inode_hasattr(sc->ip)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* Invoke the data fork scrubber. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Look for incorrect shared blocks. */ + if (xfs_has_reflink(sc->mp)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &shared); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + if (shared) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return 0; +} + +/* + * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub + * operation. Callers must not hold any locks that intersect with the CPU + * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs + * to change kernel code. + */ +void +xchk_fsgates_enable( + struct xfs_scrub *sc, + unsigned int scrub_fsgates) +{ + ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL)); + ASSERT(!(sc->flags & scrub_fsgates)); + + trace_xchk_fsgates_enable(sc, scrub_fsgates); + + if (scrub_fsgates & XCHK_FSGATES_DRAIN) + xfs_drain_wait_enable(); + + sc->flags |= scrub_fsgates; +} + +/* + * Decide if this is this a cached inode that's also allocated. The caller + * must hold a reference to an AG and the AGI buffer lock to prevent inodes + * from being allocated or freed. + * + * Look up an inode by number in the given file system. If the inode number + * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA. + * If the inode is being reclaimed, return -ENODATA because we know the inode + * cache cannot be updating the ondisk metadata. + * + * Otherwise, the incore inode is the one we want, and it is either live, + * somewhere in the inactivation machinery, or reclaimable. The inode is + * allocated if i_mode is nonzero. In all three cases, the cached inode will + * be more up to date than the ondisk inode buffer, so we must use the incore + * i_mode. + */ +int +xchk_inode_is_allocated( + struct xfs_scrub *sc, + xfs_agino_t agino, + bool *inuse) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + xfs_ino_t ino; + struct xfs_inode *ip; + int error; + + /* caller must hold perag reference */ + if (pag == NULL) { + ASSERT(pag != NULL); + return -EINVAL; + } + + /* caller must have AGI buffer */ + if (sc->sa.agi_bp == NULL) { + ASSERT(sc->sa.agi_bp != NULL); + return -EINVAL; + } + + /* reject inode numbers outside existing AGs */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + if (!xfs_verify_ino(mp, ino)) + return -EINVAL; + + error = -ENODATA; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* cache miss */ + goto out_rcu; + } + + /* + * If the inode number doesn't match, the incore inode got reused + * during an RCU grace period and the radix tree hasn't been updated. + * This isn't the inode we want. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) + goto out_skip; + + trace_xchk_inode_is_allocated(ip); + + /* + * We have an incore inode that matches the inode we want, and the + * caller holds the perag structure and the AGI buffer. Let's check + * our assumptions below: + */ + +#ifdef DEBUG + /* + * (1) If the incore inode is live (i.e. referenced from the dcache), + * it will not be INEW, nor will it be in the inactivation or reclaim + * machinery. The ondisk inode had better be allocated. This is the + * most trivial case. 
+ */ + if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE | + XFS_INACTIVATING))) { + /* live inode */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * If the incore inode is INEW, there are several possibilities: + * + * (2) For a file that is being created, note that we allocate the + * ondisk inode before allocating, initializing, and adding the incore + * inode to the radix tree. + * + * (3) If the incore inode is being recycled, the inode has to be + * allocated because we don't allow freed inodes to be recycled. + * Recycling doesn't touch i_mode. + */ + if (ip->i_flags & XFS_INEW) { + /* created on disk already or recycling */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * (4) If the inode is queued for inactivation (NEED_INACTIVE) but + * inactivation has not started (!INACTIVATING), it is still allocated. + */ + if ((ip->i_flags & XFS_NEED_INACTIVE) && + !(ip->i_flags & XFS_INACTIVATING)) { + /* definitely before difree */ + ASSERT(VFS_I(ip)->i_mode != 0); + } +#endif + + /* + * If the incore inode is undergoing inactivation (INACTIVATING), there + * are two possibilities: + * + * (5) It is before the point where it would get freed ondisk, in which + * case i_mode is still nonzero. + * + * (6) It has already been freed, in which case i_mode is zero. + * + * We don't take the ILOCK here, but difree and dialloc update the AGI, + * and we've taken the AGI buffer lock, which prevents that from + * happening. + */ + + /* + * (7) Inodes undergoing inactivation (INACTIVATING) or queued for + * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still + * reflects the ondisk state. + */ + + /* + * (8) If the inode is in IFLUSHING, it's safe to query i_mode because + * the flush code uses i_mode to format the ondisk inode. + */ + + /* + * (9) If the inode is in IRECLAIM and was reachable via the radix + * tree, it still has the same i_mode as it did before it entered + * reclaim. The inode object is still alive because we hold the RCU + * read lock. + */ + + *inuse = VFS_I(ip)->i_mode != 0; + error = 0; + +out_skip: + spin_unlock(&ip->i_flags_lock); +out_rcu: + rcu_read_unlock(); + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h new file mode 100644 index 0000000000..cabdc0e168 --- /dev/null +++ b/fs/xfs/scrub/common.h @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_COMMON_H__ +#define __XFS_SCRUB_COMMON_H__ + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xchk_should_terminate( + struct xfs_scrub *sc, + int *error) +{ + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. 
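+	 * On a fully preemptible kernel this yield is nearly free; it matters
+	 * most for PREEMPT_NONE and PREEMPT_VOLUNTARY builds, where a tight
+	 * scrub loop has no other preemption points.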
+ */ + cond_resched(); + + if (fatal_signal_pending(current)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +void xchk_trans_cancel(struct xfs_scrub *sc); + +bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int *error); +bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t offset, int *error); + +bool xchk_xref_process_error(struct xfs_scrub *sc, + xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xchk_fblock_xref_process_error(struct xfs_scrub *sc, + int whichfork, xfs_fileoff_t offset, int *error); + +void xchk_block_set_preen(struct xfs_scrub *sc, + struct xfs_buf *bp); +void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); + +void xchk_set_corrupt(struct xfs_scrub *sc); +void xchk_block_set_corrupt(struct xfs_scrub *sc, + struct xfs_buf *bp); +void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t offset); + +void xchk_block_xref_set_corrupt(struct xfs_scrub *sc, + struct xfs_buf *bp); +void xchk_ino_xref_set_corrupt(struct xfs_scrub *sc, + xfs_ino_t ino); +void xchk_fblock_xref_set_corrupt(struct xfs_scrub *sc, + int whichfork, xfs_fileoff_t offset); + +void xchk_ino_set_warning(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_fblock_set_warning(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t offset); + +void xchk_set_incomplete(struct xfs_scrub *sc); +int xchk_checkpoint_log(struct xfs_mount *mp); + +/* Are we set up for a cross-referencing check? */ +bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, + struct xfs_btree_cur **curpp); + +/* Setup functions */ +int xchk_setup_agheader(struct xfs_scrub *sc); +int xchk_setup_fs(struct xfs_scrub *sc); +int xchk_setup_ag_allocbt(struct xfs_scrub *sc); +int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); +int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); +int xchk_setup_ag_refcountbt(struct xfs_scrub *sc); +int xchk_setup_inode(struct xfs_scrub *sc); +int xchk_setup_inode_bmap(struct xfs_scrub *sc); +int xchk_setup_inode_bmap_data(struct xfs_scrub *sc); +int xchk_setup_directory(struct xfs_scrub *sc); +int xchk_setup_xattr(struct xfs_scrub *sc); +int xchk_setup_symlink(struct xfs_scrub *sc); +int xchk_setup_parent(struct xfs_scrub *sc); +#ifdef CONFIG_XFS_RT +int xchk_setup_rtbitmap(struct xfs_scrub *sc); +int xchk_setup_rtsummary(struct xfs_scrub *sc); +#else +static inline int +xchk_setup_rtbitmap(struct xfs_scrub *sc) +{ + return -ENOENT; +} +static inline int +xchk_setup_rtsummary(struct xfs_scrub *sc) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xchk_setup_quota(struct xfs_scrub *sc); +#else +static inline int +xchk_setup_quota(struct xfs_scrub *sc) +{ + return -ENOENT; +} +#endif +int xchk_setup_fscounters(struct xfs_scrub *sc); + +void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); +int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, + struct xchk_ag *sa); + +/* + * Grab all AG resources, treating the inability to grab the perag structure as + * a fs corruption. This is intended for callers checking an ondisk reference + * to a given AG, which means that the AG must still exist. + */ +static inline int +xchk_ag_init_existing( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + int error = xchk_ag_init(sc, agno, sa); + + return error == -ENOENT ? 
-EFSCORRUPTED : error; +} + +int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, + struct xchk_ag *sa); +void xchk_ag_btcur_free(struct xchk_ag *sa); +void xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); + +int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); +int xchk_iget_for_scrubbing(struct xfs_scrub *sc); +int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); +int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + +void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags); +void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + +void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); + +int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); +int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, + struct xfs_buf **agi_bpp, struct xfs_inode **ipp); +void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + +/* + * Don't bother cross-referencing if we already found corruption or cross + * referencing discrepancies. + */ +static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT); +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Decide if a repair is required. */ +static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN); +} +#else +# define xchk_needs_repair(sc) (false) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +int xchk_metadata_inode_forks(struct xfs_scrub *sc); + +/* + * Helper macros to allocate and format xfile description strings. + * Callers must kfree the pointer returned. + */ +#define xchk_xfile_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ + (sc)->mp->m_super->s_id, ##__VA_ARGS__) + +/* + * Setting up a hook to wait for intents to drain is costly -- we have to take + * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it + * up, and again to tear it down. These costs add up quickly, so we only want + * to enable the drain waiter if the drain actually detected a conflict with + * running intent chains. + */ +static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) +{ + return sc->flags & XCHK_NEED_DRAIN; +} + +void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); + +int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, + bool *inuse); + +#endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c new file mode 100644 index 0000000000..82b150d3b8 --- /dev/null +++ b/fs/xfs/scrub/dabtree.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/dabtree.h" + +/* Directory/Attribute Btree */ + +/* + * Check for da btree operation errors. See the section about handling + * operational errors in common.c. + */ +bool +xchk_da_process_error( + struct xchk_da_btree *ds, + int level, + int *error) +{ + struct xfs_scrub *sc = ds->sc; + + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + case -ECHRNG: + /* Used to restart an op with deadlock avoidance. */ + trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + fallthrough; + default: + trace_xchk_file_op_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + *error, __return_address); + break; + } + return false; +} + +/* + * Check for da btree corruption. See the section about handling + * operational errors in common.c. + */ +void +xchk_da_set_corrupt( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_scrub *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + trace_xchk_fblock_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +static struct xfs_da_node_entry * +xchk_da_btree_node_entry( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_da3_icnode_hdr hdr; + + ASSERT(blk->magic == XFS_DA_NODE_MAGIC); + + xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr); + return hdr.btree + blk->index; +} + +/* Scrub a da btree hash (key). */ +int +xchk_da_btree_hash( + struct xchk_da_btree *ds, + int level, + __be32 *hashp) +{ + struct xfs_da_node_entry *entry; + xfs_dahash_t hash; + xfs_dahash_t parent_hash; + + /* Is this hash in order? */ + hash = be32_to_cpu(*hashp); + if (hash < ds->hashes[level]) + xchk_da_set_corrupt(ds, level); + ds->hashes[level] = hash; + + if (level == 0) + return 0; + + /* Is this hash no larger than the parent hash? */ + entry = xchk_da_btree_node_entry(ds, level - 1); + parent_hash = be32_to_cpu(entry->hashval); + if (parent_hash < hash) + xchk_da_set_corrupt(ds, level); + + return 0; +} + +/* + * Check a da btree pointer. Returns true if it's ok to use this + * pointer. + */ +STATIC bool +xchk_da_btree_ptr_ok( + struct xchk_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) { + xchk_da_set_corrupt(ds, level); + return false; + } + + return true; +} + +/* + * The da btree scrubber can handle leaf1 blocks as a degenerate + * form of leafn blocks. Since the regular da code doesn't handle + * leaf1, we must multiplex the verifiers. 
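+ * The wrappers below key off the block magic: LEAF1 blocks get the dir
+ * leaf1 buffer ops, and everything else falls through to the generic da
+ * node buffer ops.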
+ */ +static void +xchk_da_btree_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_read(bp); + return; + } +} +static void +xchk_da_btree_write_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_write(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_write(bp); + return; + } +} +static void * +xchk_da_btree_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + return bp->b_ops->verify_struct(bp); + default: + bp->b_ops = &xfs_da3_node_buf_ops; + return bp->b_ops->verify_struct(bp); + } +} + +static const struct xfs_buf_ops xchk_da_btree_buf_ops = { + .name = "xchk_da_btree", + .verify_read = xchk_da_btree_read_verify, + .verify_write = xchk_da_btree_write_verify, + .verify_struct = xchk_da_btree_verify, +}; + +/* Check a block's sibling. */ +STATIC int +xchk_da_btree_block_check_sibling( + struct xchk_da_btree *ds, + int level, + int direction, + xfs_dablk_t sibling) +{ + struct xfs_da_state_path *path = &ds->state->path; + struct xfs_da_state_path *altpath = &ds->state->altpath; + int retval; + int plevel; + int error; + + memcpy(altpath, path, sizeof(ds->state->altpath)); + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (sibling == 0) { + error = xfs_da3_path_shift(ds->state, altpath, direction, + false, &retval); + if (error == 0 && retval == 0) + xchk_da_set_corrupt(ds, level); + error = 0; + goto out; + } + + /* Move the alternate cursor one block in the direction given. */ + error = xfs_da3_path_shift(ds->state, altpath, direction, false, + &retval); + if (!xchk_da_process_error(ds, level, &error)) + goto out; + if (retval) { + xchk_da_set_corrupt(ds, level); + goto out; + } + if (altpath->blk[level].bp) + xchk_buffer_recheck(ds->sc, altpath->blk[level].bp); + + /* Compare upper level pointer to sibling pointer. */ + if (altpath->blk[level].blkno != sibling) + xchk_da_set_corrupt(ds, level); + +out: + /* Free all buffers in the altpath that aren't referenced from path. */ + for (plevel = 0; plevel < altpath->active; plevel++) { + if (altpath->blk[plevel].bp == NULL || + (plevel < path->active && + altpath->blk[plevel].bp == path->blk[plevel].bp)) + continue; + + xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp); + altpath->blk[plevel].bp = NULL; + } + + return error; +} + +/* Check a block's sibling pointers. */ +STATIC int +xchk_da_btree_block_check_siblings( + struct xchk_da_btree *ds, + int level, + struct xfs_da_blkinfo *hdr) +{ + xfs_dablk_t forw; + xfs_dablk_t back; + int error = 0; + + forw = be32_to_cpu(hdr->forw); + back = be32_to_cpu(hdr->back); + + /* Top level blocks should not have sibling pointers. 
*/ + if (level == 0) { + if (forw != 0 || back != 0) + xchk_da_set_corrupt(ds, level); + return 0; + } + + /* + * Check back (left) and forw (right) pointers. These functions + * absorb error codes for us. + */ + error = xchk_da_btree_block_check_sibling(ds, level, 0, back); + if (error) + goto out; + error = xchk_da_btree_block_check_sibling(ds, level, 1, forw); + +out: + memset(&ds->state->altpath, 0, sizeof(ds->state->altpath)); + return error; +} + +/* Load a dir/attribute block from a btree. */ +STATIC int +xchk_da_btree_block( + struct xchk_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_blkinfo *hdr3; + struct xfs_da_args *dargs = &ds->dargs; + struct xfs_inode *ip = ds->dargs.dp; + xfs_ino_t owner; + int *pmaxrecs; + struct xfs_da3_icnode_hdr nodehdr; + int error = 0; + + blk = &ds->state->path.blk[level]; + ds->state->path.active = level + 1; + + /* Release old block. */ + if (blk->bp) { + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; + } + + /* Check the pointer. */ + blk->blkno = blkno; + if (!xchk_da_btree_ptr_ok(ds, level, blkno)) + goto out_nobuf; + + /* Read the buffer. */ + error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, + XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork, + &xchk_da_btree_buf_ops); + if (!xchk_da_process_error(ds, level, &error)) + goto out_nobuf; + if (blk->bp) + xchk_buffer_recheck(ds->sc, blk->bp); + + /* + * We didn't find a dir btree root block, which means that + * there's no LEAF1/LEAFN tree (at least not where it's supposed + * to be), so jump out now. + */ + if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 && + blk->bp == NULL) + goto out_nobuf; + + /* It's /not/ ok for attr trees not to have a da btree. */ + if (blk->bp == NULL) { + xchk_da_set_corrupt(ds, level); + goto out_nobuf; + } + + hdr3 = blk->bp->b_addr; + blk->magic = be16_to_cpu(hdr3->hdr.magic); + pmaxrecs = &ds->maxrecs[level]; + + /* We only started zeroing the header on v5 filesystems. */ + if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad) + xchk_da_set_corrupt(ds, level); + + /* Check the owner. */ + if (xfs_has_crc(ip->i_mount)) { + owner = be64_to_cpu(hdr3->owner); + if (owner != ip->i_ino) + xchk_da_set_corrupt(ds, level); + } + + /* Check the siblings. */ + error = xchk_da_btree_block_check_siblings(ds, level, &hdr3->hdr); + if (error) + goto out; + + /* Interpret the buffer. 
*/ + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_ATTR_LEAF_BUF); + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xchk_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAFN_BUF); + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xchk_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAF1_BUF); + blk->magic = XFS_DIR2_LEAF1_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xchk_da_set_corrupt(ds, level); + break; + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DA_NODE_BUF); + blk->magic = XFS_DA_NODE_MAGIC; + node = blk->bp->b_addr; + xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node); + btree = nodehdr.btree; + *pmaxrecs = nodehdr.count; + blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval); + if (level == 0) { + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + ds->tree_level = nodehdr.level; + } else { + if (ds->tree_level != nodehdr.level) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + } + + /* XXX: Check hdr3.pad32 once we know how to fix it. */ + break; + default: + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + + /* + * If we've been handed a block that is below the dabtree root, does + * its hashval match what the parent block expected to see? + */ + if (level > 0) { + struct xfs_da_node_entry *key; + + key = xchk_da_btree_node_entry(ds, level - 1); + if (be32_to_cpu(key->hashval) != blk->hashval) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + } + +out: + return error; +out_freebp: + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; +out_nobuf: + blk->blkno = 0; + return error; +} + +/* Visit all nodes and leaves of a da btree. */ +int +xchk_da_btree( + struct xfs_scrub *sc, + int whichfork, + xchk_da_btree_rec_fn scrub_fn, + void *private) +{ + struct xchk_da_btree *ds; + struct xfs_mount *mp = sc->mp; + struct xfs_da_state_blk *blks; + struct xfs_da_node_entry *key; + xfs_dablk_t blkno; + int level; + int error; + + /* Skip short format data structures; no btree to scan. */ + if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork))) + return 0; + + /* Set up initial da state. */ + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); + if (!ds) + return -ENOMEM; + ds->dargs.dp = sc->ip; + ds->dargs.whichfork = whichfork; + ds->dargs.trans = sc->tp; + ds->dargs.op_flags = XFS_DA_OP_OKNOENT; + ds->state = xfs_da_state_alloc(&ds->dargs); + ds->sc = sc; + ds->private = private; + if (whichfork == XFS_ATTR_FORK) { + ds->dargs.geo = mp->m_attr_geo; + ds->lowest = 0; + ds->highest = 0; + } else { + ds->dargs.geo = mp->m_dir_geo; + ds->lowest = ds->dargs.geo->leafblk; + ds->highest = ds->dargs.geo->freeblk; + } + blkno = ds->lowest; + level = 0; + + /* Find the root of the da tree, if present. 
*/ + blks = ds->state->path.blk; + error = xchk_da_btree_block(ds, level, blkno); + if (error) + goto out_state; + /* + * We didn't find a block at ds->lowest, which means that there's + * no LEAF1/LEAFN tree (at least not where it's supposed to be), + * so jump out now. + */ + if (blks[level].bp == NULL) + goto out_state; + + blks[level].index = 0; + while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) { + /* Handle leaf block. */ + if (blks[level].magic != XFS_DA_NODE_MAGIC) { + /* End of leaf, pop back towards the root. */ + if (blks[level].index >= ds->maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds->tree_level++; + level--; + continue; + } + + /* Dispatch record scrubbing. */ + error = scrub_fn(ds, level); + if (error) + break; + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + blks[level].index++; + continue; + } + + + /* End of node, pop back towards the root. */ + if (blks[level].index >= ds->maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds->tree_level++; + level--; + continue; + } + + /* Hashes in order for scrub? */ + key = xchk_da_btree_node_entry(ds, level); + error = xchk_da_btree_hash(ds, level, &key->hashval); + if (error) + goto out; + + /* Drill another level deeper. */ + blkno = be32_to_cpu(key->before); + level++; + if (level >= XFS_DA_NODE_MAXDEPTH) { + /* Too deep! */ + xchk_da_set_corrupt(ds, level - 1); + break; + } + ds->tree_level--; + error = xchk_da_btree_block(ds, level, blkno); + if (error) + goto out; + if (blks[level].bp == NULL) + goto out; + + blks[level].index = 0; + } + +out: + /* Release all the buffers we're tracking. */ + for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) { + if (blks[level].bp == NULL) + continue; + xfs_trans_brelse(sc->tp, blks[level].bp); + blks[level].bp = NULL; + } + +out_state: + xfs_da_state_free(ds->state); + kfree(ds); + return error; +} diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h new file mode 100644 index 0000000000..4f8c2138a1 --- /dev/null +++ b/fs/xfs/scrub/dabtree.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DABTREE_H__ +#define __XFS_SCRUB_DABTREE_H__ + +/* dir/attr btree */ + +struct xchk_da_btree { + struct xfs_da_args dargs; + xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH]; + int maxrecs[XFS_DA_NODE_MAXDEPTH]; + struct xfs_da_state *state; + struct xfs_scrub *sc; + void *private; + + /* + * Lowest and highest directory block address in which we expect + * to find dir/attr btree node blocks. For a directory this + * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for + * attributes there is no limit. + */ + xfs_dablk_t lowest; + xfs_dablk_t highest; + + int tree_level; +}; + +typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level); + +/* Check for da btree operation errors. */ +bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); + +/* Check for da btree corruption. 
*/ +void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level); + +int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp); +int xchk_da_btree(struct xfs_scrub *sc, int whichfork, + xchk_da_btree_rec_fn scrub_fn, void *private); + +#endif /* __XFS_SCRUB_DABTREE_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c new file mode 100644 index 0000000000..0b491784b7 --- /dev/null +++ b/fs/xfs/scrub/dir.c @@ -0,0 +1,790 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/dabtree.h" +#include "scrub/readdir.h" + +/* Set us up to scrub directories. */ +int +xchk_setup_directory( + struct xfs_scrub *sc) +{ + return xchk_setup_inode_contents(sc, 0); +} + +/* Directories */ + +/* Scrub a directory entry. */ + +/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ +STATIC void +xchk_dir_check_ftype( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + struct xfs_inode *ip, + int ftype) +{ + struct xfs_mount *mp = sc->mp; + + if (!xfs_has_ftype(mp)) { + if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return; + } + + if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); +} + +/* + * Scrub a single directory entry. + * + * Check the inode number to make sure it's sane, then we check that we can + * look up this filename. Finally, we check the ftype. + */ +STATIC int +xchk_dir_actor( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip; + xfs_ino_t lookup_ino; + xfs_dablk_t offset; + int error = 0; + + offset = xfs_dir2_db_to_da(mp->m_dir_geo, + xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos)); + + if (xchk_should_terminate(sc, &error)) + return error; + + /* Does this inode number make sense? */ + if (!xfs_verify_dir_ino(mp, ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; + } + + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name->name, name->len)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; + } + + if (!strncmp(".", name->name, name->len)) { + /* If this is "." then check that the inum matches the dir. */ + if (ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else if (!strncmp("..", name->name, name->len)) { + /* + * If this is ".." in the root inode, check that the inum + * matches this dir. + */ + if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } + + /* Verify that we can look up this name by hash. 
*/ + error = xchk_dir_lookup(sc, dp, name, &lookup_ino); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + goto out; + if (lookup_ino != ino) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; + } + + /* + * Grab the inode pointed to by the dirent. We release the inode + * before we cancel the scrub transaction. + * + * If _iget returns -EINVAL or -ENOENT then the child inode number is + * garbage and the directory is corrupt. If the _iget returns + * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a + * cross referencing error. Any other error is an operational error. + */ + error = xchk_iget(sc, ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + error = -EFSCORRUPTED; + xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error)) + goto out; + + xchk_dir_check_ftype(sc, offset, ip, name->type); + xchk_irele(sc, ip); +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return error; +} + +/* Scrub a directory btree record. */ +STATIC int +xchk_dir_rec( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_name dname = { }; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_mount *mp = ds->state->mp; + struct xfs_inode *dp = ds->dargs.dp; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_data_entry *dent; + struct xfs_buf *bp; + struct xfs_dir2_leaf_entry *ent; + unsigned int end; + unsigned int iter_off; + xfs_ino_t ino; + xfs_dablk_t rec_bno; + xfs_dir2_db_t db; + xfs_dir2_data_aoff_t off; + xfs_dir2_dataptr_t ptr; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + struct xfs_dir3_icleaf_hdr hdr; + unsigned int tag; + int error; + + ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr); + ent = hdr.ents + blk->index; + + /* Check the hash of the entry. */ + error = xchk_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Valid hash pointer? */ + ptr = be32_to_cpu(ent->address); + if (ptr == 0) + return 0; + + /* Find the directory entry's location. */ + db = xfs_dir2_dataptr_to_db(geo, ptr); + off = xfs_dir2_dataptr_to_off(geo, ptr); + rec_bno = xfs_dir2_db_to_da(geo, db); + + if (rec_bno >= geo->leafblk) { + xchk_da_set_corrupt(ds, level); + goto out; + } + error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, + XFS_DABUF_MAP_HOLE_OK, &bp); + if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, + &error)) + goto out; + if (!bp) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out; + } + xchk_buffer_recheck(ds->sc, bp); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_relse; + + dent = bp->b_addr + off; + + /* Make sure we got a real directory entry. 
*/ + iter_off = geo->data_entry_offset; + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + if (!end) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + for (;;) { + struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off; + struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off; + + if (iter_off >= end) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + iter_off += be16_to_cpu(dup->length); + continue; + } + if (dep == dent) + break; + iter_off += xfs_dir2_data_entsize(mp, dep->namelen); + } + + /* Retrieve the entry, sanity check it, and compare hashes. */ + ino = be64_to_cpu(dent->inumber); + hash = be32_to_cpu(ent->hashval); + tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent)); + if (!xfs_verify_dir_ino(mp, ino) || tag != off) + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + if (dent->namelen == 0) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + + /* Does the directory hash match? */ + dname.name = dent->name; + dname.len = dent->namelen; + calc_hash = xfs_dir2_hashname(mp, &dname); + if (calc_hash != hash) + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + +out_relse: + xfs_trans_brelse(ds->dargs.trans, bp); +out: + return error; +} + +/* + * Is this unused entry either in the bestfree or smaller than all of + * them? We've already checked that the bestfrees are sorted longest to + * shortest, and that there aren't any bogus entries. + */ +STATIC void +xchk_directory_check_free_entry( + struct xfs_scrub *sc, + xfs_dablk_t lblk, + struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup) +{ + struct xfs_dir2_data_free *dfp; + unsigned int dup_length; + + dup_length = be16_to_cpu(dup->length); + + /* Unused entry is shorter than any of the bestfrees */ + if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return; + + for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--) + if (dup_length == be16_to_cpu(dfp->length)) + return; + + /* Unused entry should be in the bestfrees but wasn't found. */ + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory data block. */ +STATIC int +xchk_directory_data_bestfree( + struct xfs_scrub *sc, + xfs_dablk_t lblk, + bool is_block) +{ + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_free *dfp; + struct xfs_buf *bp; + struct xfs_dir2_data_free *bf; + struct xfs_mount *mp = sc->mp; + u16 tag; + unsigned int nr_bestfrees = 0; + unsigned int nr_frees = 0; + unsigned int smallest_bestfree; + int newlen; + unsigned int offset; + unsigned int end; + int error; + + if (is_block) { + /* dir block format */ + if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + } else { + /* dir data format */ + error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); + } + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + xchk_buffer_recheck(sc, bp); + + /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; + + /* Do the bestfrees correspond to actual free space? 
*/ + bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr); + smallest_bestfree = UINT_MAX; + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { + offset = be16_to_cpu(dfp->offset); + if (offset == 0) + continue; + if (offset >= mp->m_dir_geo->blksize) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + dup = bp->b_addr + offset; + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + + /* bestfree doesn't match the entry it points at? */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) || + be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) || + tag != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + /* bestfree records should be ordered largest to smallest */ + if (smallest_bestfree < be16_to_cpu(dfp->length)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + smallest_bestfree = be16_to_cpu(dfp->length); + nr_bestfrees++; + } + + /* Make sure the bestfrees are actually the best free spaces. */ + offset = mp->m_dir_geo->data_entry_offset; + end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr); + + /* Iterate the entries, stopping when we hit or go past the end. */ + while (offset < end) { + dup = bp->b_addr + offset; + + /* Skip real entries */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) { + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + + newlen = xfs_dir2_data_entsize(mp, dep->namelen); + if (newlen <= 0) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + lblk); + goto out_buf; + } + offset += newlen; + continue; + } + + /* Spot check this free entry */ + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + if (tag != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + /* + * Either this entry is a bestfree or it's smaller than + * any of the bestfrees. + */ + xchk_directory_check_free_entry(sc, lblk, bf, dup); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; + + /* Move on. */ + newlen = be16_to_cpu(dup->length); + if (newlen <= 0) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + offset += newlen; + if (offset <= end) + nr_frees++; + } + + /* We're required to fill all the space. */ + if (offset != end) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* Did we see at least as many free slots as there are bestfrees? */ + if (nr_frees < nr_bestfrees) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out_buf: + xfs_trans_brelse(sc->tp, bp); +out: + return error; +} + +/* + * Does the free space length in the free space index block ($len) match + * the longest length in the directory data block's bestfree array? + * Assume that we've already checked that the data block's bestfree + * array is in order. + */ +STATIC void +xchk_directory_check_freesp( + struct xfs_scrub *sc, + xfs_dablk_t lblk, + struct xfs_buf *dbp, + unsigned int len) +{ + struct xfs_dir2_data_free *dfp; + + dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr); + + if (len != be16_to_cpu(dfp->length)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + if (len > 0 && be16_to_cpu(dfp->offset) == 0) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory leaf1 block. 
*/ +STATIC int +xchk_directory_leaf1_bestfree( + struct xfs_scrub *sc, + struct xfs_da_args *args, + xfs_dir2_db_t last_data_db, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_tail *ltp; + struct xfs_dir2_leaf *leaf; + struct xfs_buf *dbp; + struct xfs_buf *bp; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + __be16 *bestp; + __u16 best; + __u32 hash; + __u32 lasthash = 0; + __u32 bestcount; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block. */ + error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + return error; + xchk_buffer_recheck(sc, bp); + + leaf = bp->b_addr; + xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + bestcount = be32_to_cpu(ltp->bestcount); + bestp = xfs_dir2_leaf_bests_p(ltp); + + if (xfs_has_crc(sc->mp)) { + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* + * There must be enough bestfree slots to cover all the directory data + * blocks that we scanned. It is possible for there to be a hole + * between the last data block and i_disk_size. This seems like an + * oversight to the scrub author, but as we have been writing out + * directories like this (and xfs_repair doesn't mind them) for years, + * that's what we have to check. + */ + if (bestcount != last_data_db + 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Is the leaf count even remotely sane? */ + if (leafhdr.count > geo->leaf_max_ents) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Leaves and bests don't overlap in leaf format. */ + if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Check hash value order, count stale entries. */ + for (i = 0; i < leafhdr.count; i++) { + hash = be32_to_cpu(leafhdr.ents[i].hashval); + if (i > 0 && lasthash > hash) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + lasthash = hash; + if (leafhdr.ents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (leafhdr.stale != stale) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check all the bestfree entries. */ + for (i = 0; i < bestcount; i++, bestp++) { + best = be16_to_cpu(*bestp); + error = xfs_dir3_data_read(sc->tp, sc->ip, + xfs_dir2_db_to_da(args->geo, i), + XFS_DABUF_MAP_HOLE_OK, + &dbp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + break; + + if (!dbp) { + if (best != NULLDATAOFF) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + lblk); + break; + } + continue; + } + + if (best == NULLDATAOFF) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + else + xchk_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + } +out: + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Check free space info in a directory freespace block. 
*/ +STATIC int +xchk_directory_free_bestfree( + struct xfs_scrub *sc, + struct xfs_da_args *args, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_buf *dbp; + struct xfs_buf *bp; + __u16 best; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block */ + error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + return error; + xchk_buffer_recheck(sc, bp); + + if (xfs_has_crc(sc->mp)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* Check all the entries. */ + xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr); + for (i = 0; i < freehdr.nvalid; i++) { + best = be16_to_cpu(freehdr.bests[i]); + if (best == NULLDATAOFF) { + stale++; + continue; + } + error = xfs_dir3_data_read(sc->tp, sc->ip, + (freehdr.firstdb + i) * args->geo->fsbcount, + 0, &dbp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + goto out; + xchk_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + } + + if (freehdr.nused + stale != freehdr.nvalid) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out: + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Check free space information in directories. */ +STATIC int +xchk_directory_blocks( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec got; + struct xfs_da_args args = { + .dp = sc ->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + struct xfs_mount *mp = sc->mp; + xfs_fileoff_t leaf_lblk; + xfs_fileoff_t free_lblk; + xfs_fileoff_t lblk; + struct xfs_iext_cursor icur; + xfs_dablk_t dabno; + xfs_dir2_db_t last_data_db = 0; + bool found; + bool is_block = false; + int error; + + /* Ignore local format directories. */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS && + ifp->if_format != XFS_DINODE_FMT_BTREE) + return 0; + + lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); + leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); + free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); + + /* Is this a block dir? */ + error = xfs_dir2_isblock(&args, &is_block); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + /* Iterate all the data extents in the directory... */ + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + /* No more data blocks... */ + if (got.br_startoff >= leaf_lblk) + break; + + /* + * Check each data block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. 
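+		 * We also record the highest data block seen (last_data_db)
+		 * so that the leaf1 bestfree check below can confirm that the
+		 * bests array covers every data block.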
+ */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + last_data_db = xfs_dir2_da_to_db(args.geo, lblk); + error = xchk_directory_data_bestfree(sc, lblk, + is_block); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Look for a leaf1 block, which has free info. */ + if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) && + got.br_startoff == leaf_lblk && + got.br_blockcount == args.geo->fsbcount && + !xfs_iext_next_extent(ifp, &icur, &got)) { + if (is_block) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db, + leaf_lblk); + if (error) + goto out; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Scan for free blocks */ + lblk = free_lblk; + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + /* + * Dirs can't have blocks mapped above 2^32. + * Single-block dirs shouldn't even be here. + */ + lblk = got.br_startoff; + if (lblk & ~0xFFFFFFFFULL) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + if (is_block) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* + * Check each dir free block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. + */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + error = xchk_directory_free_bestfree(sc, &args, + lblk); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } +out: + return error; +} + +/* Scrub a whole directory. */ +int +xchk_directory( + struct xfs_scrub *sc) +{ + int error; + + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* Plausible size? */ + if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* Check directory tree structure */ + error = xchk_da_btree(sc, XFS_DATA_FORK, xchk_dir_rec, NULL); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Check the freespace. */ + error = xchk_directory_blocks(sc); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Look up every name in this directory by hash. */ + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); + if (error == -ECANCELED) + error = 0; + return error; +} diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c new file mode 100644 index 0000000000..05be757668 --- /dev/null +++ b/fs/xfs/scrub/fscounters.c @@ -0,0 +1,606 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_health.h" +#include "xfs_btree.h" +#include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * FS Summary Counters + * =================== + * + * The basics of filesystem summary counter checking are that we iterate the + * AGs counting the number of free blocks, free space btree blocks, per-AG + * reservations, inodes, delayed allocation reservations, and free inodes. + * Then we compare what we computed against the in-core counters. + * + * However, the reality is that summary counters are a tricky beast to check. + * While we /could/ freeze the filesystem and scramble around the AGs counting + * the free blocks, in practice we prefer not do that for a scan because + * freezing is costly. To get around this, we added a per-cpu counter of the + * delalloc reservations so that we can rotor around the AGs relatively + * quickly, and we allow the counts to be slightly off because we're not taking + * any locks while we do this. + * + * So the first thing we do is warm up the buffer cache in the setup routine by + * walking all the AGs to make sure the incore per-AG structure has been + * initialized. The expected value calculation then iterates the incore per-AG + * structures as quickly as it can. We snapshot the percpu counters before and + * after this operation and use the difference in counter values to guess at + * our tolerance for mismatch between expected and actual counter values. + */ + +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; + bool frozen; +}; + +/* + * Since the expected value computation is lockless but only browses incore + * values, the percpu counters should be fairly close to each other. However, + * we'll allow ourselves to be off by at least this (arbitrary) amount. + */ +#define XCHK_FSCOUNT_MIN_VARIANCE (512) + +/* + * Make sure the per-AG structure has been initialized from the on-disk header + * contents and trust that the incore counters match the ondisk counters. (The + * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the + * summary counters after checking all AG headers). Do this from the setup + * function so that the inner AG aggregation loop runs as quickly as possible. + * + * This function runs during the setup phase /before/ we start checking any + * metadata. + */ +STATIC int +xchk_fscount_warmup( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp = NULL; + struct xfs_buf *agf_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno; + int error = 0; + + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; + if (xfs_perag_initialised_agi(pag) && + xfs_perag_initialised_agf(pag)) + continue; + + /* Lock both AG headers. */ + error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + if (error) + break; + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); + if (error) + break; + + /* + * These are supposed to be initialized by the header read + * function. 
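+		 * If either flag is still clear after both reads succeed,
+		 * the AG headers are suspect, so treat it as corruption
+		 * rather than aggregating uninitialized per-AG counters.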
+ */ + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { + error = -EFSCORRUPTED; + break; + } + + xfs_buf_relse(agf_bp); + agf_bp = NULL; + xfs_buf_relse(agi_bp); + agi_bp = NULL; + } + + if (agf_bp) + xfs_buf_relse(agf_bp); + if (agi_bp) + xfs_buf_relse(agi_bp); + if (pag) + xfs_perag_rele(pag); + return error; +} + +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + +int +xchk_setup_fscounters( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc; + int error; + + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; + fsc = sc->buf; + fsc->sc = sc; + + xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); + + /* We must get the incore counters set up before we can proceed. */ + error = xchk_fscount_warmup(sc); + if (error) + return error; + + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. 
This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + +/* Count free space btree blocks manually for pre-lazysbcount filesystems. */ +static int +xchk_fscount_btreeblks( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc, + xfs_agnumber_t agno) +{ + xfs_extlen_t blocks; + int error; + + error = xchk_ag_init_existing(sc, agno, &sc->sa); + if (error) + goto out_free; + + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + +out_free: + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* + * Calculate what the global in-core counters ought to be from the incore + * per-AG structure. Callers can compare this to the actual in-core counters + * to estimate by how much both in-core and on-disk counters need to be + * adjusted. + */ +STATIC int +xchk_fscount_aggregate_agcounts( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + uint64_t delayed; + xfs_agnumber_t agno; + int tries = 8; + int error = 0; + +retry: + fsc->icount = 0; + fsc->ifree = 0; + fsc->fdblocks = 0; + + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; + + /* This somehow got unset since the warmup? */ + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { + error = -EFSCORRUPTED; + break; + } + + /* Count all the inodes */ + fsc->icount += pag->pagi_count; + fsc->ifree += pag->pagi_freecount; + + /* Add up the free/freelist/bnobt/cntbt blocks */ + fsc->fdblocks += pag->pagf_freeblks; + fsc->fdblocks += pag->pagf_flcount; + if (xfs_has_lazysbcount(sc->mp)) { + fsc->fdblocks += pag->pagf_btreeblks; + } else { + error = xchk_fscount_btreeblks(sc, fsc, agno); + if (error) + break; + } + + /* + * Per-AG reservations are taken out of the incore counters, + * so they must be left out of the free blocks computation. + */ + fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; + fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; + + } + if (pag) + xfs_perag_rele(pag); + if (error) { + xchk_set_incomplete(sc); + return error; + } + + /* + * The global incore space reservation is taken from the incore + * counters, so leave that out of the computation. + */ + fsc->fdblocks -= mp->m_resblks_avail; + + /* + * Delayed allocation reservations are taken out of the incore counters + * but not recorded on disk, so leave them and their indlen blocks out + * of the computation. + */ + delayed = percpu_counter_sum(&mp->m_delalloc_blks); + fsc->fdblocks -= delayed; + + trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, + delayed); + + + /* Bail out if the values we compute are totally nonsense. */ + if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || + fsc->fdblocks > mp->m_sb.sb_dblocks || + fsc->ifree > fsc->icount_max) + return -EFSCORRUPTED; + + /* + * If ifree > icount then we probably had some perturbation in the + * counters while we were calculating things. We'll try a few times + * to maintain ifree <= icount before giving up. 
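(The retry budget is the tries counter initialized above; when it runs out we return -EDEADLOCK so the caller can come back with XCHK_TRY_HARDER set, which makes the setup function freeze the filesystem and stop the churn.)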
+ */ + if (fsc->ifree > fsc->icount) { + if (tries--) + goto retry; + return -EDEADLOCK; + } + + return 0; +} + +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. + */ + +/* + * Is the @counter reasonably close to the @expected value? + * + * We neither locked nor froze anything in the filesystem while aggregating the + * per-AG data to compute the @expected value, which means that the counter + * could have changed. We know the @old_value of the summation of the counter + * before the aggregation, and we re-sum the counter now. If the expected + * value falls between the two summations, we're ok. + * + * Otherwise, we /might/ have a problem. If the change in the summations is + * more than we want to tolerate, the filesystem is probably busy and we should + * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. + */ +static inline bool +xchk_fscount_within_range( + struct xfs_scrub *sc, + const int64_t old_value, + struct percpu_counter *counter, + uint64_t expected) +{ + int64_t min_value, max_value; + int64_t curr_value = percpu_counter_sum(counter); + + trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, + old_value); + + /* Negative values are always wrong. */ + if (curr_value < 0) + return false; + + /* Exact matches are always ok. */ + if (curr_value == expected) + return true; + + min_value = min(old_value, curr_value); + max_value = max(old_value, curr_value); + + /* Within the before-and-after range is ok. */ + if (expected >= min_value && expected <= max_value) + return true; + + /* Everything else is bad. */ + return false; +} + +/* Check the superblock counters. */ +int +xchk_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; + int error; + + /* Snapshot the percpu counters. */ + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); + + /* No negative values, please! 
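(Unlike the space counters handled below, icount and ifree have no transient reservation path that could push them negative, so a negative sum here is reported as corruption outright.)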
*/ + if (icount < 0 || ifree < 0) + xchk_set_corrupt(sc); + + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + + /* See if icount is obviously wrong. */ + if (icount < fsc->icount_min || icount > fsc->icount_max) + xchk_set_corrupt(sc); + + /* See if fdblocks is obviously wrong. */ + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. + */ + if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) + xchk_set_corrupt(sc); + + /* Walk the incore AG headers to calculate the expected counters. */ + error = xchk_fscount_aggregate_agcounts(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c new file mode 100644 index 0000000000..5e2b09ed6e --- /dev/null +++ b/fs/xfs/scrub/health.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_health.h" +#include "scrub/scrub.h" +#include "scrub/health.h" + +/* + * Scrub and In-Core Filesystem Health Assessments + * =============================================== + * + * Online scrub and repair have the time and the ability to perform stronger + * checks than we can do from the metadata verifiers, because they can + * cross-reference records between data structures. Therefore, scrub is in a + * good position to update the online filesystem health assessments to reflect + * the good/bad state of the data structure. + * + * We therefore extend scrub in the following ways to achieve this: + * + * 1. Create a "sick_mask" field in the scrub context. When we're setting up a + * scrub call, set this to the default XFS_SICK_* flag(s) for the selected + * scrub type (call it A). Scrub and repair functions can override the default + * sick_mask value if they choose. + * + * 2. If the scrubber returns a runtime error code, we exit making no changes + * to the incore sick state. + * + * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore + * sick flags before exiting. + * + * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore + * sick flags. If the user didn't want to repair then we exit, leaving the + * metadata structure unfixed and the sick flag set. + * + * 5. Now we know that A is corrupt and the user wants to repair, so run the + * repairer. If the repairer returns an error code, we exit with that error + * code, having made no further changes to the incore sick state. + * + * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean, + * use sick_mask to clear the incore sick flags. This should have the effect + * that A is no longer marked sick. + * + * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and + * use sick_mask to set the incore sick flags. This should have no externally + * visible effect since we already set them in step (4). + * + * There are some complications to this story, however. For certain types of + * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild + * both structures at the same time. The following principles apply to this + * type of repair strategy: + * + * 8. Any repair function that rebuilds multiple structures should update + * sick_mask_visible to reflect whatever other structures are rebuilt, and + * verify that all the rebuilt structures can pass a scrub check. The outcomes + * of 5-7 still apply, but with a sick_mask that covers everything being + * rebuilt. + */ + +/* Map our scrub type to a sick mask and a set of health update functions. 
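Each entry pairs a scrub type with the health group that owns it (filesystem, realtime, AG, or inode) and the XFS_SICK_* bit that xchk_update_health will set or clear.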
*/ + +enum xchk_health_group { + XHG_FS = 1, + XHG_RT, + XHG_AG, + XHG_INO, +}; + +struct xchk_health_map { + enum xchk_health_group group; + unsigned int sick_mask; +}; + +static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, + [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, + [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, + [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI }, + [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT }, + [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT }, + [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT }, + [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT }, + [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT }, + [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT }, + [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE }, + [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD }, + [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA }, + [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC }, + [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR }, + [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, + [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, + [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, + [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, + [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, +}; + +/* Return the health status mask for this scrub type. */ +unsigned int +xchk_health_mask_for_scrub_type( + __u32 scrub_type) +{ + return type_to_health_flag[scrub_type].sick_mask; +} + +/* + * Update filesystem health assessments based on what we found and did. + * + * If the scrubber finds errors, we mark sick whatever's mentioned in + * sick_mask, no matter whether this is a first scan or an + * evaluation of repair effectiveness. + * + * Otherwise, no direct corruption was found, so mark whatever's in + * sick_mask as healthy. + */ +void +xchk_update_health( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + bool bad; + + if (!sc->sick_mask) + return; + + bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)); + switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_AG: + pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); + if (bad) + xfs_ag_mark_sick(pag, sc->sick_mask); + else + xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_perag_put(pag); + break; + case XHG_INO: + if (!sc->ip) + return; + if (bad) + xfs_inode_mark_sick(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + break; + case XHG_FS: + if (bad) + xfs_fs_mark_sick(sc->mp, sc->sick_mask); + else + xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + break; + case XHG_RT: + if (bad) + xfs_rt_mark_sick(sc->mp, sc->sick_mask); + else + xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + break; + default: + ASSERT(0); + break; + } +} + +/* Is the given per-AG btree healthy enough for scanning? */ +bool +xchk_ag_btree_healthy_enough( + struct xfs_scrub *sc, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + unsigned int mask = 0; + + /* + * We always want the cursor if it's the same type as whatever we're + * scrubbing, even if we already know the structure is corrupt. 
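Scrubbing that btree is the very reason we were called, so its own cursor is never withheld.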
+ * + * Otherwise, we're only interested in the btree for cross-referencing. + * If we know the btree is bad then don't bother, just set XFAIL. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + return true; + mask = XFS_SICK_AG_BNOBT; + break; + case XFS_BTNUM_CNT: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) + return true; + mask = XFS_SICK_AG_CNTBT; + break; + case XFS_BTNUM_INO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + return true; + mask = XFS_SICK_AG_INOBT; + break; + case XFS_BTNUM_FINO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + return true; + mask = XFS_SICK_AG_FINOBT; + break; + case XFS_BTNUM_RMAP: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) + return true; + mask = XFS_SICK_AG_RMAPBT; + break; + case XFS_BTNUM_REFC: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) + return true; + mask = XFS_SICK_AG_REFCNTBT; + break; + default: + ASSERT(0); + return true; + } + + /* + * If we just repaired some AG metadata, sc->sick_mask will reflect all + * the per-AG metadata types that were repaired. Exclude these from + * the filesystem health query because we have not yet updated the + * health status and we want everything to be scanned. + */ + if ((sc->flags & XREP_ALREADY_FIXED) && + type_to_health_flag[sc->sm->sm_type].group == XHG_AG) + mask &= ~sc->sick_mask; + + if (xfs_ag_has_sickness(pag, mask)) { + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + return false; + } + + return true; +} diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h new file mode 100644 index 0000000000..66a273f858 --- /dev/null +++ b/fs/xfs/scrub/health.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_HEALTH_H__ +#define __XFS_SCRUB_HEALTH_H__ + +unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); +void xchk_update_health(struct xfs_scrub *sc); +bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, + xfs_btnum_t btnum); + +#endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c new file mode 100644 index 0000000000..fb7bbf47ae --- /dev/null +++ b/fs/xfs/scrub/ialloc.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "xfs_ag.h" + +/* + * Set us up to scrub inode btrees. + * If we detect a discrepancy between the inobt and the inode, + * try again after forcing logged inode cores out to disk. + */ +int +xchk_setup_ag_iallocbt( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); +} + +/* Inode btree scrubber. */ + +struct xchk_iallocbt { + /* Number of inodes we see while scanning inobt. */ + unsigned long long inodes; + + /* Expected next startino, for big block filesystems. 
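When one fs block holds more than one inode chunk, consecutive inobt records must follow each other back to back; this field tracks where the next record ought to begin.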
*/ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; +}; + +/* + * Does the finobt have a record for this inode with the same hole/free state? + * This is a bit complicated because of the following: + * + * - The finobt need not have a record if all inodes in the inobt record are + * allocated. + * - The finobt need not have a record if all inodes in the inobt record are + * free. + * - The finobt need not have a record if the inobt record says this is a hole. + * This likely doesn't happen in practice. + */ +STATIC int +xchk_inobt_xref_finobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + bool free, + bool hole) +{ + struct xfs_inobt_rec_incore frec; + struct xfs_btree_cur *cur = sc->sa.fino_cur; + bool ffree, fhole; + unsigned int frec_idx, fhole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &frec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's a finobt record; free and hole status must match. */ + frec_idx = agino - frec.ir_startino; + ffree = frec.ir_free & (1ULL << frec_idx); + fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec.ir_holemask & (1U << fhole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* inobt record is fully allocated */ + if (irec->ir_free == 0) + return 0; + + /* inobt record is totally unallocated */ + if (irec->ir_free == XFS_INOBT_ALL_FREE) + return 0; + + /* inobt record says this is a hole */ + if (hole) + return 0; + + /* finobt doesn't care about allocated inodes */ + if (!free) + return 0; + + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; +} + +/* + * Make sure that each inode of this part of an inobt record has the same + * sparse and free status as the finobt. + */ +STATIC void +xchk_inobt_chunk_xref_finobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + unsigned int nr_inodes) +{ + xfs_agino_t i; + unsigned int rec_idx; + int error; + + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT); + + if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) + return; + + for (i = agino, rec_idx = agino - irec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool free, hole; + unsigned int hole_idx; + + free = irec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec->ir_holemask & (1U << hole_idx); + + error = xchk_inobt_xref_finobt(sc, irec, i, free, hole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + } +} + +/* + * Does the inobt have a record for this inode with the same hole/free state? + * The inobt must always have a record if there's a finobt record. 
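A lookup miss in the inobt is therefore flagged as cross-referencing corruption rather than quietly ignored.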
+ */ +STATIC int +xchk_finobt_xref_inobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *frec, + xfs_agino_t agino, + bool ffree, + bool fhole) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_btree_cur *cur = sc->sa.ino_cur; + bool free, hole; + unsigned int rec_idx, hole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's an inobt record; free and hole status must match. */ + rec_idx = agino - irec.ir_startino; + free = irec.ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec.ir_holemask & (1U << hole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* finobt should never have a record for which the inobt does not */ + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; +} + +/* + * Make sure that each inode of this part of an finobt record has the same + * sparse and free status as the inobt. + */ +STATIC void +xchk_finobt_chunk_xref_inobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *frec, + xfs_agino_t agino, + unsigned int nr_inodes) +{ + xfs_agino_t i; + unsigned int rec_idx; + int error; + + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT); + + if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm)) + return; + + for (i = agino, rec_idx = agino - frec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool ffree, fhole; + unsigned int hole_idx; + + ffree = frec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec->ir_holemask & (1U << hole_idx); + + error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + } +} + +/* Is this chunk worth checking and cross-referencing? */ +STATIC bool +xchk_iallocbt_chunk( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = bs->sc; + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_perag *pag = bs->cur->bc_ag.pag; + xfs_agblock_t agbno; + xfs_extlen_t len; + + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize); + + if (!xfs_verify_agbext(pag, agbno, len)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; + + xchk_xref_is_used_space(sc, agbno, len); + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes); + else + xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes); + xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); + return true; +} + +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. 
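In the uncached case the decision comes from di_mode in the cluster buffer, and a mismatch with ir_free is returned as -EDEADLOCK (unless XCHK_TRY_HARDER is set) so the check is retried instead of mis-reported.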
+ * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ +STATIC int +xchk_iallocbt_check_cluster_ifree( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec, + unsigned int irec_ino, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; + bool freemask_ok; + int error = 0; + + if (xchk_should_terminate(bs->sc, &error)) + return error; + + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse); + if (error == -ENODATA) { + /* Not cached, just read the disk buffer */ + freemask_ok = irec_free ^ !!(dip->di_mode); + if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok) + return -EDEADLOCK; + } else if (error < 0) { + /* + * Inode is only half assembled, or there was an IO error, + * or the verifier failed, so don't bother trying to check. + * The inode scrubber can deal with this. + */ + goto out; + } else { + /* Inode is all there. */ + freemask_ok = irec_free ^ ino_inuse; + } + if (!freemask_ok) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +out: + return 0; +} + +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ +STATIC int +xchk_iallocbt_check_cluster( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) +{ + struct xfs_imap imap; + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + xfs_agblock_t agbno; + unsigned int cluster_index; + uint16_t cluster_mask = 0; + uint16_t ir_holemask; + int error = 0; + + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + M_IGEO(mp)->inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. 
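The two cases are mutually exclusive, which is why a record with both a nonzero im_boffset and a nonzero cluster_base is flagged as corrupt just below.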
+ */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) << + mp->m_sb.sb_inodelog; + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + M_IGEO(mp)->blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &cluster_bp); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + break; + } + + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. 
+ * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + igeo->cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + iabt->next_startino += XFS_INODES_PER_CHUNK; + + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; + } + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster; +} + +/* Scrub an inobt/finobt record. */ +STATIC int +xchk_iallocbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xchk_iallocbt *iabt = bs->private; + struct xfs_inobt_rec_incore irec; + uint64_t holes; + xfs_agino_t agino; + int holecount; + int i; + int error = 0; + uint16_t holemask; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + agino = irec.ir_startino; + + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + iabt->inodes += irec.ir_count; + + /* Handle non-sparse inodes */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + if (irec.ir_count != XFS_INODES_PER_CHUNK) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_CHUNK)) + goto out; + goto check_clusters; + } + + /* Check each chunk of a sparse inode cluster. 
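Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT inodes; set bits are tallied as holes while clear bits are cross-referenced like a dense chunk, and holecount plus ir_count must add back up to XFS_INODES_PER_CHUNK.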
*/ + holemask = irec.ir_holemask; + holecount = 0; + holes = ~xfs_inobt_irec_to_allocmask(&irec); + if ((holes & irec.ir_free) != holes || + irec.ir_freecount > irec.ir_count) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { + if (holemask & 1) + holecount += XFS_INODES_PER_HOLEMASK_BIT; + else if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_HOLEMASK_BIT)) + goto out; + holemask >>= 1; + agino += XFS_INODES_PER_HOLEMASK_BIT; + } + + if (holecount > XFS_INODES_PER_CHUNK || + holecount + irec.ir_count != XFS_INODES_PER_CHUNK) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + +check_clusters: + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + error = xchk_iallocbt_check_clusters(bs, &irec); + if (error) + goto out; + +out: + return error; +} + +/* + * Make sure the inode btrees are as large as the rmap thinks they are. + * Don't bother if we're missing btree cursors, as we're already corrupt. + */ +STATIC void +xchk_iallocbt_xref_rmap_btreeblks( + struct xfs_scrub *sc, + int which) +{ + xfs_filblks_t blocks; + xfs_extlen_t inobt_blocks = 0; + xfs_extlen_t finobt_blocks = 0; + int error; + + if (!sc->sa.ino_cur || !sc->sa.rmap_cur || + (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) || + xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many inobt blocks as the rmap says. */ + error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks); + if (!xchk_process_error(sc, 0, 0, &error)) + return; + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks); + if (!xchk_process_error(sc, 0, 0, &error)) + return; + } + + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INOBT, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != inobt_blocks + finobt_blocks) + xchk_btree_set_corrupt(sc, sc->sa.ino_cur, 0); +} + +/* + * Make sure that the inobt records point to the same number of blocks as + * the rmap says are owned by inodes. + */ +STATIC void +xchk_iallocbt_xref_rmap_inodes( + struct xfs_scrub *sc, + int which, + unsigned long long inodes) +{ + xfs_filblks_t blocks; + xfs_filblks_t inode_blocks; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many inode blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INODES, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize); + if (blocks != inode_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* Scrub the inode btrees for some AG. */ +STATIC int +xchk_iallocbt( + struct xfs_scrub *sc, + xfs_btnum_t which) +{ + struct xfs_btree_cur *cur; + struct xchk_iallocbt iabt = { + .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, + }; + int error; + + cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, + &iabt); + if (error) + return error; + + xchk_iallocbt_xref_rmap_btreeblks(sc, which); + + /* + * If we're scrubbing the inode btree, inode_blocks is the number of + * blocks pointed to by all the inode chunk records. Therefore, we + * should compare to the number of inode chunk blocks that the rmap + * knows about. We can't do this for the finobt since it only points + * to inode chunks with free inodes. 
+ */ + if (which == XFS_BTNUM_INO) + xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); + + return error; +} + +int +xchk_inobt( + struct xfs_scrub *sc) +{ + return xchk_iallocbt(sc, XFS_BTNUM_INO); +} + +int +xchk_finobt( + struct xfs_scrub *sc) +{ + return xchk_iallocbt(sc, XFS_BTNUM_FINO); +} + +/* See if an inode btree has (or doesn't have) an inode chunk record. */ +static inline void +xchk_xref_inode_check( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len, + struct xfs_btree_cur **icur, + enum xbtree_recpacking expected) +{ + enum xbtree_recpacking outcome; + int error; + + if (!(*icur) || xchk_skip_xref(sc->sm)) + return; + + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, icur)) + return; + if (outcome != expected) + xchk_btree_xref_set_corrupt(sc, *icur, 0); +} + +/* xref check that the extent is not covered by inodes */ +void +xchk_xref_is_not_inode_chunk( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_EMPTY); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, + XBTREE_RECPACKING_EMPTY); +} + +/* xref check that the extent is covered by inodes */ +void +xchk_xref_is_inode_chunk( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_FULL); +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c new file mode 100644 index 0000000000..59d7912fb7 --- /dev/null +++ b/fs/xfs/scrub/inode.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_ag.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_rmap.h" +#include "xfs_bmap_util.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* Prepare the attached inode for scrubbing. */ +static inline int +xchk_prepare_iscrub( + struct xfs_scrub *sc) +{ + int error; + + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Install this scrub-by-handle inode and prepare it for scrubbing. */ +static inline int +xchk_install_handle_iscrub( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + int error; + + error = xchk_install_handle_inode(sc, ip); + if (error) + return error; + + return xchk_prepare_iscrub(sc); +} + +/* + * Grab total control of the inode metadata. In the best case, we grab the + * incore inode and take all locks on it. If the incore inode cannot be + * constructed due to corruption problems, lock the AGI so that we can single + * step the loading process to fix everything that can go wrong. 
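The fallback below retries the iget with the AGI held and, if that still fails, maps the inode through the inobt so that a genuinely free inode can be told apart from a corrupt one.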
+ */ +int +xchk_setup_inode( + struct xfs_scrub *sc) +{ + struct xfs_imap imap; + struct xfs_inode *ip; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + /* We want to scan the opened inode, so lock it and exit. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + error = xchk_install_live_inode(sc, ip_in); + if (error) + return error; + + return xchk_prepare_iscrub(sc); + } + + /* Reject internal metadata files and obviously bad inode numbers. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_iscrub(sc, ip); + if (error == -ENOENT) + return error; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_error; + + /* + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is + * a record, and it says this inode is free. + * + * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but + * some other metadata corruption (e.g. inode forks) prevented + * instantiation of the incore inode. Or it could mean the inobt is + * corrupt. + * + * We want to look up this inode in the inobt directly to distinguish + * three different scenarios: (1) the inobt says the inode is free, + * in which case there's nothing to do; (2) the inobt is corrupt so we + * should flag the corruption and exit to userspace to let it fix the + * inobt; and (3) the inobt says the inode is allocated, but loading it + * failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity in + * this AG. Retry the iget in case someone allocated a new inode after + * the first iget failed. + */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the incore inode, so install it and proceed. */ + xchk_trans_cancel(sc); + return xchk_install_handle_iscrub(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; + } + + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt doesn't think this is an allocated inode then we'll + * return ENOENT to signal that the check can be skipped. + * + * If the lookup signals corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters a runtime error, exit to userspace. 
+ */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; + if (error) + goto out_cancel; + + /* + * The lookup succeeded. Chances are the ondisk inode is corrupt and + * preventing iget from reading it. Retain the scrub transaction and + * the AGI buffer to prevent anyone from allocating or freeing inodes. + * This ensures that we preserve the inconsistency between the inobt + * saying the inode is allocated and the icache being unable to load + * the inode until we can flag the corruption in xchk_inode. The + * scrub function has to note the corruption, since we're not really + * supposed to do that from the setup function. + */ + return 0; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; +} + +/* Inode core */ + +/* Validate di_extsize hint. */ +STATIC void +xchk_inode_extsize( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_extsize); + + fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags); + if (fa) + xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with extszinherit and rtinherit set, the hint could + * become misaligned with the new rextsize. The verifier doesn't check + * this, because we allow rtinherit directories even without an rt + * device. Flag this as an administrative warning since we will clean + * this up eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags & XFS_DIFLAG_EXTSZINHERIT) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); +} + +/* + * Validate di_cowextsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). + * These functions must be kept in sync with each other. + */ +STATIC void +xchk_inode_cowextsize( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + xfs_failaddr_t fa; + + fa = xfs_inode_validate_cowextsize(sc->mp, + be32_to_cpu(dip->di_cowextsize), mode, flags, + flags2); + if (fa) + xchk_ino_set_corrupt(sc, ino); +} + +/* Make sure the di_flags make sense for the inode. 
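Every flag must be valid for the file type and for the filesystem's configuration: realtime flags need a realtime device, the inherit flags are directory-only, and so on.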
*/ +STATIC void +xchk_inode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + struct xfs_mount *mp = sc->mp; + + /* di_flags are all taken, last bit cannot be used */ + if (flags & ~XFS_DIFLAG_ANY) + goto bad; + + /* rt flags require rt device */ + if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) + goto bad; + + /* new rt bitmap flag only valid for rbmino */ + if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino) + goto bad; + + /* directory-only flags */ + if ((flags & (XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS)) && + !S_ISDIR(mode)) + goto bad; + + /* file-only flags */ + if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) && + !S_ISREG(mode)) + goto bad; + + /* filestreams and rt make no sense */ + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) + goto bad; + + return; +bad: + xchk_ino_set_corrupt(sc, ino); +} + +/* Make sure the di_flags2 make sense for the inode. */ +STATIC void +xchk_inode_flags2( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + struct xfs_mount *mp = sc->mp; + + /* Unknown di_flags2 could be from a future kernel */ + if (flags2 & ~XFS_DIFLAG2_ANY) + xchk_ino_set_warning(sc, ino); + + /* reflink flag requires reflink feature */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_has_reflink(mp)) + goto bad; + + /* cowextsize flag is checked w.r.t. mode separately */ + + /* file/dir-only flags */ + if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode))) + goto bad; + + /* file-only flags */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) + goto bad; + + /* realtime and reflink make no sense, currently */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + goto bad; + + /* no bigtime iflag without the bigtime feature */ + if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) + goto bad; + + return; +bad: + xchk_ino_set_corrupt(sc, ino); +} + +static inline void +xchk_dinode_nsec( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + + tv = xfs_inode_from_disk_ts(dip, ts); + if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) + xchk_ino_set_corrupt(sc, ino); +} + +/* Scrub all the ondisk inode fields. */ +STATIC void +xchk_dinode( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + xfs_ino_t ino) +{ + struct xfs_mount *mp = sc->mp; + size_t fork_recs; + unsigned long long isize; + uint64_t flags2; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + prid_t prid; + uint16_t flags; + uint16_t mode; + + flags = be16_to_cpu(dip->di_flags); + if (dip->di_version >= 3) + flags2 = be64_to_cpu(dip->di_flags2); + else + flags2 = 0; + + /* di_mode */ + mode = be16_to_cpu(dip->di_mode); + switch (mode & S_IFMT) { + case S_IFLNK: + case S_IFREG: + case S_IFDIR: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + /* mode is recognized */ + break; + default: + xchk_ino_set_corrupt(sc, ino); + break; + } + + /* v1/v2 fields */ + switch (dip->di_version) { + case 1: + /* + * We autoconvert v1 inodes into v2 inodes on writeout, + * so just mark this inode for preening. 
+ */ + xchk_ino_set_preen(sc, ino); + prid = 0; + break; + case 2: + case 3: + if (dip->di_onlink != 0) + xchk_ino_set_corrupt(sc, ino); + + if (dip->di_mode == 0 && sc->ip) + xchk_ino_set_corrupt(sc, ino); + + if (dip->di_projid_hi != 0 && + !xfs_has_projid32(mp)) + xchk_ino_set_corrupt(sc, ino); + + prid = be16_to_cpu(dip->di_projid_lo); + break; + default: + xchk_ino_set_corrupt(sc, ino); + return; + } + + if (xfs_has_projid32(mp)) + prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; + + /* + * di_uid/di_gid -- -1 isn't invalid, but there's no way that + * userspace could have created that. + */ + if (dip->di_uid == cpu_to_be32(-1U) || + dip->di_gid == cpu_to_be32(-1U)) + xchk_ino_set_warning(sc, ino); + + /* + * project id of -1 isn't supposed to be valid, but the kernel didn't + * always validate that. + */ + if (prid == -1U) + xchk_ino_set_warning(sc, ino); + + /* di_format */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (!S_ISCHR(mode) && !S_ISBLK(mode) && + !S_ISFIFO(mode) && !S_ISSOCK(mode)) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_LOCAL: + if (!S_ISDIR(mode) && !S_ISLNK(mode)) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_EXTENTS: + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_BTREE: + if (!S_ISREG(mode) && !S_ISDIR(mode)) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_UUID: + default: + xchk_ino_set_corrupt(sc, ino); + break; + } + + /* di_[amc]time.nsec */ + xchk_dinode_nsec(sc, ino, dip, dip->di_atime); + xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); + xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); + + /* + * di_size. xfs_dinode_verify checks for things that screw up + * the VFS such as the upper bit being set and zero-length + * symlinks/directories, but we can do more here. + */ + isize = be64_to_cpu(dip->di_size); + if (isize & (1ULL << 63)) + xchk_ino_set_corrupt(sc, ino); + + /* Devices, fifos, and sockets must have zero size */ + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) + xchk_ino_set_corrupt(sc, ino); + + /* Directories can't be larger than the data section size (32G) */ + if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) + xchk_ino_set_corrupt(sc, ino); + + /* Symlinks can't be larger than SYMLINK_MAXLEN */ + if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) + xchk_ino_set_corrupt(sc, ino); + + /* + * Warn if the running kernel can't handle the kinds of offsets + * needed to deal with the file size. In other words, if the + * pagecache can't cache all the blocks in this file due to + * overly large offsets, flag the inode for admin review. + */ + if (isize > mp->m_super->s_maxbytes) + xchk_ino_set_warning(sc, ino); + + /* di_nblocks */ + if (flags2 & XFS_DIFLAG2_REFLINK) { + ; /* nblocks can exceed dblocks */ + } else if (flags & XFS_DIFLAG_REALTIME) { + /* + * nblocks is the sum of data extents (in the rtdev), + * attr extents (in the datadev), and both forks' bmbt + * blocks (in the datadev). This clumsy check is the + * best we can do without cross-referencing with the + * inode forks. 
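Hence the sanity bound used below is simply sb_dblocks + sb_rblocks.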
+ */ + if (be64_to_cpu(dip->di_nblocks) >= + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) + xchk_ino_set_corrupt(sc, ino); + } else { + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) + xchk_ino_set_corrupt(sc, ino); + } + + xchk_inode_flags(sc, dip, ino, mode, flags); + + xchk_inode_extsize(sc, dip, ino, mode, flags); + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + + /* di_nextents */ + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_format) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xchk_ino_set_corrupt(sc, ino); + break; + default: + if (nextents != 0) + xchk_ino_set_corrupt(sc, ino); + break; + } + + /* di_forkoff */ + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + xchk_ino_set_corrupt(sc, ino); + if (naextents != 0 && dip->di_forkoff == 0) + xchk_ino_set_corrupt(sc, ino); + if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) + xchk_ino_set_corrupt(sc, ino); + + /* di_aformat */ + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && + dip->di_aformat != XFS_DINODE_FMT_BTREE) + xchk_ino_set_corrupt(sc, ino); + + /* di_anextents */ + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if (naextents > fork_recs) + xchk_ino_set_corrupt(sc, ino); + break; + case XFS_DINODE_FMT_BTREE: + if (naextents <= fork_recs) + xchk_ino_set_corrupt(sc, ino); + break; + default: + if (naextents != 0) + xchk_ino_set_corrupt(sc, ino); + } + + if (dip->di_version >= 3) { + xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); + xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); + xchk_inode_cowextsize(sc, dip, ino, mode, flags, + flags2); + } +} + +/* + * Make sure the finobt doesn't think this inode is free. + * We don't have to check the inobt ourselves because we got the inode via + * IGET_UNTRUSTED, which checks the inobt for us. + */ +static void +xchk_inode_xref_finobt( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + struct xfs_inobt_rec_incore rec; + xfs_agino_t agino; + int has_record; + int error; + + if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) + return; + + agino = XFS_INO_TO_AGINO(sc->mp, ino); + + /* + * Try to get the finobt record. If we can't get it, then we're + * in good shape. + */ + error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE, + &has_record); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + /* + * Otherwise, make sure this record either doesn't cover this inode, + * or that it does but it's marked present. + */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + return; + + if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)) + xchk_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0); +} + +/* Cross reference the inode fields with the forks. */ +STATIC void +xchk_inode_xref_bmap( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + xfs_extnum_t nextents; + xfs_filblks_t count; + xfs_filblks_t acount; + int error; + + if (xchk_skip_xref(sc->sm)) + return; + + /* Walk all the extents to check nextents/naextents/nblocks. 
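The data fork only has to contain at least as many in-core extents as the ondisk count, the attr fork count must match exactly, and the block totals of both forks must add up to di_nblocks.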
*/ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (!xchk_should_check_xref(sc, &error, NULL)) + return; + if (nextents < xfs_dfork_data_extents(dip)) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (!xchk_should_check_xref(sc, &error, NULL)) + return; + if (nextents != xfs_dfork_attr_extents(dip)) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + + /* Check nblocks against the inode. */ + if (count + acount != be64_to_cpu(dip->di_nblocks)) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_inode_xref( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agno = XFS_INO_TO_AGNO(sc->mp, ino); + agbno = XFS_INO_TO_AGBNO(sc->mp, ino); + + error = xchk_ag_init_existing(sc, agno, &sc->sa); + if (!xchk_xref_process_error(sc, agno, agbno, &error)) + goto out_free; + + xchk_xref_is_used_space(sc, agbno, 1); + xchk_inode_xref_finobt(sc, ino); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); + xchk_inode_xref_bmap(sc, dip); + +out_free: + xchk_ag_free(sc, &sc->sa); +} + +/* + * If the reflink iflag disagrees with a scan for shared data fork extents, + * either flag an error (shared extents w/ no flag) or a preen (flag set w/o + * any shared extents). We already checked for reflink iflag set on a non + * reflink filesystem. + */ +static void +xchk_inode_check_reflink_iflag( + struct xfs_scrub *sc, + xfs_ino_t ino) +{ + struct xfs_mount *mp = sc->mp; + bool has_shared; + int error; + + if (!xfs_has_reflink(mp)) + return; + + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + return; + if (xfs_is_reflink_inode(sc->ip) && !has_shared) + xchk_ino_set_preen(sc, ino); + else if (!xfs_is_reflink_inode(sc->ip) && has_shared) + xchk_ino_set_corrupt(sc, ino); +} + +/* Scrub an inode. */ +int +xchk_inode( + struct xfs_scrub *sc) +{ + struct xfs_dinode di; + int error = 0; + + /* + * If sc->ip is NULL, that means that the setup function called + * xfs_iget to look up the inode. xfs_iget returned a EFSCORRUPTED + * and a NULL inode, so flag the corruption error and return. + */ + if (!sc->ip) { + xchk_ino_set_corrupt(sc, sc->sm->sm_ino); + return 0; + } + + /* Scrub the inode core. */ + xfs_inode_to_disk(sc->ip, &di, 0); + xchk_dinode(sc, &di, sc->ip->i_ino); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* + * Look for discrepancies between file's data blocks and the reflink + * iflag. We already checked the iflag against the file mode when + * we scrubbed the dinode. + */ + if (S_ISREG(VFS_I(sc->ip)->i_mode)) + xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); + + xchk_inode_xref(sc, sc->ip->i_ino, &di); +out: + return error; +} diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c new file mode 100644 index 0000000000..e6155d86f7 --- /dev/null +++ b/fs/xfs/scrub/parent.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/readdir.h" + +/* Set us up to scrub parents. */ +int +xchk_setup_parent( + struct xfs_scrub *sc) +{ + return xchk_setup_inode_contents(sc, 0); +} + +/* Parent pointers */ + +/* Look for an entry in a parent pointing to this inode. */ + +struct xchk_parent_ctx { + struct xfs_scrub *sc; + xfs_nlink_t nlink; +}; + +/* Look for a single entry in a directory pointing to an inode. */ +STATIC int +xchk_parent_actor( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_parent_ctx *spc = priv; + int error = 0; + + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name->name, name->len)) + error = -EFSCORRUPTED; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + if (sc->ip->i_ino == ino) + spc->nlink++; + + if (xchk_should_terminate(spc->sc, &error)) + return error; + + return 0; +} + +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_ilock_dir( + struct xfs_inode *dp) +{ + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) + return 0; + + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_ILOCK_SHARED; + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) + return 0; + + return XFS_ILOCK_EXCL; +} + +/* + * Given the inode number of the alleged parent of the inode being scrubbed, + * try to validate that the parent has exactly one directory entry pointing + * back to the inode being scrubbed. Returns -EAGAIN if we need to revalidate + * the dotdot entry. + */ +STATIC int +xchk_parent_validate( + struct xfs_scrub *sc, + xfs_ino_t parent_ino) +{ + struct xchk_parent_ctx spc = { + .sc = sc, + .nlink = 0, + }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *dp = NULL; + xfs_nlink_t expected_nlink; + unsigned int lock_mode; + int error = 0; + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* '..' must not point to ourselves. */ + if (sc->ip->i_ino == parent_ino) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * If we're an unlinked directory, the parent /won't/ have a link + * to us. Otherwise, it should have one link. + */ + expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; + + /* + * Grab the parent directory inode. This must be released before we + * cancel the scrub transaction. + * + * If _iget returns -EINVAL or -ENOENT then the parent inode number is + * garbage and the directory is corrupt. If the _iget returns + * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a + * cross referencing error. Any other error is an operational error. 
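The expected_nlink computation above encodes the whole invariant being verified: a live child directory must be referenced by exactly one dirent in its parent, and an unlinked child by none. A minimal standalone restatement, with plain unsigned ints standing in for the kernel's link-count and counter types:

#include <stdbool.h>

/*
 * child_nlink: the child directory's own link count;
 * entries_in_parent: how many dirents in the parent point at the child.
 */
static bool parent_link_count_ok(unsigned int child_nlink,
				 unsigned int entries_in_parent)
{
	/* An unlinked directory has no dirent anywhere, so expect zero. */
	unsigned int expected = (child_nlink == 0) ? 0 : 1;

	return entries_in_parent == expected;
}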
+ */ + error = xchk_iget(sc, parent_ino, &dp); + if (error == -EINVAL || error == -ENOENT) { + error = -EFSCORRUPTED; + xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + return error; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_rele; + } + + lock_mode = xchk_parent_ilock_dir(dp); + if (!lock_mode) { + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = -EAGAIN; + goto out_rele; + } + + /* Look for a directory entry in the parent pointing to the child. */ + error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + + /* + * Ensure that the parent has as many links to the child as the child + * thinks it has to the parent. + */ + if (spc.nlink != expected_nlink) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + +out_unlock: + xfs_iunlock(dp, lock_mode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* Scrub a parent pointer. */ +int +xchk_parent( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_ino_t parent_ino; + int error = 0; + + /* + * If we're a directory, check that the '..' link points up to + * a directory that has one entry pointing to us. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* We're not a special inode, are we? */ + if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + do { + if (xchk_should_terminate(sc, &error)) + break; + + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(mp, parent_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * Check that the dotdot entry points to a parent directory + * containing a dirent pointing to this subdirectory. + */ + error = xchk_parent_validate(sc, parent_ino); + } while (error == -EAGAIN); + + return error; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c new file mode 100644 index 0000000000..5671c81534 --- /dev/null +++ b/fs/xfs/scrub/quota.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +/* Convert a scrub type code to a DQ flag, or return 0 if error. */ +static inline xfs_dqtype_t +xchk_quota_to_dqtype( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_UQUOTA: + return XFS_DQTYPE_USER; + case XFS_SCRUB_TYPE_GQUOTA: + return XFS_DQTYPE_GROUP; + case XFS_SCRUB_TYPE_PQUOTA: + return XFS_DQTYPE_PROJ; + default: + return 0; + } +} + +/* Set us up to scrub a quota. 
*/ +int +xchk_setup_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + if (!XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; + + dqtype = xchk_quota_to_dqtype(sc); + if (dqtype == 0) + return -EINVAL; + + if (!xfs_this_quota_on(sc->mp, dqtype)) + return -ENOENT; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_setup_fs(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype)); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Quotas. */ + +struct xchk_quota_info { + struct xfs_scrub *sc; + xfs_dqid_t last_id; +}; + +/* Scrub the fields in an individual quota item. */ +STATIC int +xchk_quota_item( + struct xfs_dquot *dq, + xfs_dqtype_t dqtype, + void *priv) +{ + struct xchk_quota_info *sqi = priv; + struct xfs_scrub *sc = sqi->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset; + xfs_ino_t fs_icount; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Except for the root dquot, the actual dquot we got must either have + * the same or higher id as we saw before. + */ + offset = dq->q_id / qi->qi_dqperchunk; + if (dq->q_id && dq->q_id <= sqi->last_id) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + sqi->last_id = dq->q_id; + + /* + * Warn if the hard limits are larger than the fs. + * Administrators can do this, though in production this seems + * suspect, which is why we flag it for review. + * + * Complain about corruption if the soft limit is greater than + * the hard limit. + */ + if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Check the resource counts. */ + fs_icount = percpu_counter_sum(&mp->m_icount); + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. + */ + if (xfs_has_reflink(mp)) { + if (mp->m_sb.sb_dblocks < dq->q_blk.count) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, + offset); + } else { + if (mp->m_sb.sb_dblocks < dq->q_blk.count) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); + } + if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * We can violate the hard limits if the admin suddenly sets a + * lower limit than the actual usage. However, we flag it for + * admin review. 
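The per-dquot limit checks above distinguish impossible states from merely suspicious ones. The standalone sketch below captures that ordering for one resource; the verdict enum and uint64_t fields are simplifications of the kernel's dquot resource structures, not its API.

#include <stdint.h>

enum demo_verdict { DQ_OK, DQ_WARN, DQ_CORRUPT };

/* soft/hard: configured limits; used: current usage; fs_max: fs capacity. */
static enum demo_verdict check_dquot_limits(uint64_t soft, uint64_t hard,
					    uint64_t used, uint64_t fs_max)
{
	if (soft > hard)
		return DQ_CORRUPT;	/* limits are ordered the wrong way */
	if (hard > fs_max)
		return DQ_WARN;		/* limit exceeds what the fs can hold */
	if (hard != 0 && used > hard)
		return DQ_WARN;		/* admin lowered the limit below usage */
	return DQ_OK;
}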
+ */ + if (dq->q_id == 0) + goto out; + + if (dq->q_blk.hardlimit != 0 && + dq->q_blk.count > dq->q_blk.hardlimit) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + if (dq->q_ino.hardlimit != 0 && + dq->q_ino.count > dq->q_ino.hardlimit) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + if (dq->q_rtb.hardlimit != 0 && + dq->q_rtb.count > dq->q_rtb.hardlimit) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + + return 0; +} + +/* Check the quota's data fork. */ +STATIC int +xchk_quota_data_fork( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + int error = 0; + + /* Invoke the fork scrubber. */ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (xchk_should_terminate(sc, &error)) + break; + + /* + * delalloc/unwritten extents or blocks mapped above the highest + * quota id shouldn't happen. + */ + if (!xfs_bmap_is_written_extent(&irec) || + irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + break; + } + } + + return error; +} + +/* Scrub all of a quota type's items. */ +int +xchk_quota( + struct xfs_scrub *sc) +{ + struct xchk_quota_info sqi; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_dqtype_t dqtype; + int error = 0; + + dqtype = xchk_quota_to_dqtype(sc); + + /* Look for problem extents. */ + error = xchk_quota_data_fork(sc); + if (error) + goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* + * Check all the quota items. Now that we've checked the quota inode + * data fork we have to drop ILOCK_EXCL to use the regular dquot + * functions. + */ + xchk_iunlock(sc, sc->ilock_flags); + sqi.sc = sc; + sqi.last_id = 0; + error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (error == -ECANCELED) + error = 0; + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, + sqi.last_id * qi->qi_dqperchunk, &error)) + goto out; + +out: + return error; +} diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c new file mode 100644 index 0000000000..e51c1544be --- /dev/null +++ b/fs/xfs/scrub/readdir.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/readdir.h" + +/* Call a function for every entry in a shortform directory. 
*/ +STATIC int +xchk_dir_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_name name = { + .name = ".", + .len = 1, + .type = XFS_DIR3_FT_DIR, + }; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_hdr *sfp; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + unsigned int i; + int error; + + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + + /* dot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset); + + error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv); + if (error) + return error; + + /* dotdot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); + ino = xfs_dir2_sf_get_parent_ino(sfp); + name.name = ".."; + name.len = 2; + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + /* iterate everything else */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + name.name = sfep->name; + name.len = sfep->namelen; + name.type = xfs_dir2_sf_get_ftype(mp, sfep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); + } + + return 0; +} + +/* Call a function for every entry in a block directory. */ +STATIC int +xchk_dir_walk_block( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp; + unsigned int off, next_off, end; + int error; + + error = xfs_dir3_block_read(sc->tp, dp, &bp); + if (error) + return error; + + /* Walk each directory entry. */ + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + for (off = geo->data_entry_offset; off < end; off = next_off) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup = bp->b_addr + off; + struct xfs_dir2_data_entry *dep = bp->b_addr + off; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + + /* Skip an empty entry. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + next_off = off + be16_to_cpu(dup->length); + continue; + } + + /* Otherwise, find the next entry and report it. */ + next_off = off + xfs_dir2_data_entsize(mp, dep->namelen); + if (next_off > end) + break; + + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off); + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + } + + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Read a leaf-format directory buffer. */ +STATIC int +xchk_read_leaf_dir_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_da_geometry *geo, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec map; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_dablk_t last_da; + xfs_dablk_t map_off; + xfs_dir2_off_t new_off; + + *bpp = NULL; + + /* + * Look for mapped directory blocks at or above the current offset. 
+ * Truncate down to the nearest directory block to start the scanning + * operation. + */ + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff)); + + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) + return 0; + if (map.br_startoff >= last_da) + return 0; + xfs_trim_extent(&map, map_off, last_da - map_off); + + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *curoff) + *curoff = new_off; + + return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); +} + +/* Call a function for every entry in a leaf directory. */ +STATIC int +xchk_dir_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp = NULL; + xfs_dir2_off_t curoff = 0; + unsigned int offset = 0; + int error; + + /* Iterate every directory offset in this directory. */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_entry *dep; + xfs_ino_t ino; + unsigned int length; + xfs_dir2_dataptr_t dapos; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || offset >= geo->blksize) { + if (bp) { + xfs_trans_brelse(sc->tp, bp); + bp = NULL; + } + + error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff, + &bp); + if (error || !bp) + break; + + /* + * Find our position in the block. + */ + offset = geo->data_entry_offset; + curoff += geo->data_entry_offset; + } + + /* Skip an empty entry. */ + dup = bp->b_addr + offset; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + offset += length; + curoff += length; + continue; + } + + /* Otherwise, find the next entry and report it. */ + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + + dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + + /* Advance to the next entry. */ + offset += length; + curoff += length; + } + + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* + * Call a function for every entry in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. + */ +int +xchk_dir_walk( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + }; + bool isblock; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) + return xchk_dir_walk_block(sc, dp, dirent_fn, priv); + + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); +} + +/* + * Look up the inode number for an exact name in a directory. 
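The tail of xchk_dir_walk above is a three-way dispatch on the directory's physical layout. A skeletal restatement of that control flow, with function pointers standing in for the shortform, single-block, and leaf/node walkers; the names here are placeholders, not kernel functions.

enum demo_dir_layout { DIR_SHORTFORM, DIR_SINGLE_BLOCK, DIR_LEAF_OR_NODE };

typedef int (*demo_walk_fn)(void *ctx);

static int walk_dir(enum demo_dir_layout layout, demo_walk_fn walk_sf,
		    demo_walk_fn walk_block, demo_walk_fn walk_leaf, void *ctx)
{
	switch (layout) {
	case DIR_SHORTFORM:	/* entries decoded straight out of the inode */
		return walk_sf(ctx);
	case DIR_SINGLE_BLOCK:	/* one directory data block */
		return walk_block(ctx);
	default:		/* leaf or node format: walk block by block */
		return walk_leaf(ctx);
	}
}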
+ * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. Names are not + * checked for correctness. + */ +int +xchk_dir_lookup( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *ino) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + .name = name->name, + .namelen = name->len, + .filetype = name->type, + .hashval = xfs_dir2_hashname(dp->i_mount, name), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + bool isblock, isleaf; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_lookup(&args); + goto out_check_rval; + } + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) { + error = xfs_dir2_block_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_isleaf(&args, &isleaf); + if (error) + return error; + + if (isleaf) { + error = xfs_dir2_leaf_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_node_lookup(&args); + +out_check_rval: + if (error == -EEXIST) + error = 0; + if (!error) + *ino = args.inumber; + return error; +} diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h new file mode 100644 index 0000000000..55787f4df1 --- /dev/null +++ b/fs/xfs/scrub/readdir.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_READDIR_H__ +#define __XFS_SCRUB_READDIR_H__ + +typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, const struct xfs_name *name, + xfs_ino_t ino, void *priv); + +int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, void *priv); + +int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, + const struct xfs_name *name, xfs_ino_t *ino); + +#endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c new file mode 100644 index 0000000000..86a62420e0 --- /dev/null +++ b/fs/xfs/scrub/reap.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/reap.h" + +/* + * Disposal of Blocks from Old Metadata + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (bitmap) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from bitmap. In theory the extents + * remaining in bitmap are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @bitmap, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We must also invalidate any buffers associated with + * @bitmap. + */ + +/* Information about reaping extents after a repair. */ +struct xreap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + + /* If true, roll the transaction before reaping the next extent. 
*/ + bool force_roll; + + /* Number of deferred reaps attached to the current transaction. */ + unsigned int deferred; + + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int invalidated; + + /* Number of deferred reaps queued during the whole reap sequence. */ + unsigned long long total_deferred; +}; + +/* Put a block back on the AGFL. */ +STATIC int +xreap_put_freelist( + struct xfs_scrub *sc, + xfs_agblock_t agbno) +{ + struct xfs_buf *agfl_bp; + int error; + + /* Make sure there's space on the freelist. */ + error = xrep_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + agfl_bp, agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Are there any uncommitted reap operations? */ +static inline bool xreap_dirty(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred) + return true; + if (rs->invalidated) + return true; + if (rs->total_deferred) + return true; + return false; +} + +#define XREAP_MAX_BINVAL (2048) + +/* + * Decide if we want to roll the transaction after reaping an extent. We don't + * want to overrun the transaction reservation, so we prohibit more than + * 128 EFIs per transaction. For the same reason, we limit the number + * of buffer invalidations to 2048. + */ +static inline bool xreap_want_roll(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) + return true; + if (rs->invalidated > XREAP_MAX_BINVAL) + return true; + return false; +} + +static inline void xreap_reset(struct xreap_state *rs) +{ + rs->total_deferred += rs->deferred; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +#define XREAP_MAX_DEFER_CHAIN (2048) + +/* + * Decide if we want to finish the deferred ops that are attached to the scrub + * transaction. We don't want to queue huge chains of deferred ops because + * that can consume a lot of log space and kernel memory. Hence we trigger a + * xfs_defer_finish if there are more than 2048 deferred reap operations or the + * caller did some real work. + */ +static inline bool +xreap_want_defer_finish(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) + return true; + return false; +} + +static inline void xreap_defer_finish_reset(struct xreap_state *rs) +{ + rs->total_deferred = 0; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +/* Try to invalidate the incore buffers for an extent that we're freeing. */ +STATIC void +xreap_agextent_binval( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t agbno_next = agbno + *aglenp; + xfs_agblock_t bno = agbno; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. 
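The roll and defer-finish heuristics above keep two budgets: per-transaction work (EFIs logged and buffers invalidated) and the length of the deferred-op chain built up since the last xfs_defer_finish. A standalone model of those two decisions, with the limits passed in rather than hard-coded; the struct is a stand-in for struct xreap_state, not kernel API.

#include <stdbool.h>

struct demo_reap_counters {
	bool			force_roll;	/* caller demanded a roll */
	unsigned int		deferred;	/* EFIs in this transaction */
	unsigned int		invalidated;	/* binvals in this transaction */
	unsigned long long	total_deferred;	/* EFIs since the last finish */
};

static bool want_roll(const struct demo_reap_counters *c,
		      unsigned int max_efis, unsigned int max_binval)
{
	return c->force_roll || c->deferred > max_efis ||
	       c->invalidated > max_binval;
}

static bool want_defer_finish(const struct demo_reap_counters *c,
			      unsigned long long max_chain)
{
	return c->force_roll || c->total_deferred > max_chain;
}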
+ */ + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return; + + /* + * If there are incore buffers for these blocks, invalidate them. We + * assume that the lack of any other known owners means that the buffer + * can be locked without risk of deadlocking. The buffer cache cannot + * detect aliasing, so employ nested loops to scan for incore buffers + * of any plausible size. + */ + while (bno < agbno_next) { + xfs_agblock_t fsbcount; + xfs_agblock_t max_fsbs; + + /* + * Max buffer size is the max remote xattr buffer size, which + * is one fs block larger than 64k. + */ + max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); + + for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + struct xfs_buf *bp = NULL; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, agno, bno); + error = xfs_buf_incore(mp->m_ddev_targp, daddr, + XFS_FSB_TO_BB(mp, fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + rs->invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * far we've gotten. + */ + if (rs->invalidated > XREAP_MAX_BINVAL) { + *aglenp -= agbno_next - bno; + goto out; + } + } + + bno++; + } + +out: + trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); +} + +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. AGFL blocks can only be put back one at + * a time. + */ +STATIC int +xreap_agextent_select( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_agblock_t agbno_next, + bool *crosslinked, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_agblock_t bno = agbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* AGFL blocks can only be deal with one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) + goto out_found; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < agbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + +out_found: + *aglenp = len; + trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this AG extent as possible. The + * number of blocks disposed of will be returned in @aglenp. + */ +STATIC int +xreap_agextent_iter( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_fsblock_t fsbno; + int error = 0; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. 
Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + + rs->force_roll = true; + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, + *aglenp, rs->oinfo); + } + + trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + + /* + * Invalidate as many buffers as we can, starting at agbno. If this + * function sets *aglenp to zero, the transaction is full of logged + * buffer invalidations, so we need to return early so that we can + * roll and retry. + */ + xreap_agextent_binval(rs, agbno, aglenp); + if (*aglenp == 0) { + ASSERT(xreap_want_roll(rs)); + return 0; + } + + /* Put blocks back on the AGFL one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) { + ASSERT(*aglenp == 1); + error = xreap_put_freelist(sc, agbno); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + + /* + * Use deferred frees to get rid of the old btree blocks to try to + * minimize the window in which we could crash and lose the old blocks. + */ + error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + rs->resv, true); + if (error) + return error; + + rs->deferred++; + return 0; +} + +/* + * Break an AG metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. + */ +STATIC int +xreap_agmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agblock_t agbno = fsbno; + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip == NULL); + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + return error; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + return error; + + if (xreap_want_defer_finish(rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xrep_roll_ag_trans(sc); + if (error) + return error; + xreap_reset(rs); + } + + agbno += aglen; + } + + return 0; +} + +/* Dispose of every block of every AG metadata extent in the bitmap. */ +int +xrep_reap_agblocks( + struct xfs_scrub *sc, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip == NULL); + + error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h new file mode 100644 index 0000000000..fe24626af1 --- /dev/null +++ b/fs/xfs/scrub/reap.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_REAP_H__ +#define __XFS_SCRUB_REAP_H__ + +int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +#endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c new file mode 100644 index 0000000000..304ea1e1bf --- /dev/null +++ b/fs/xfs/scrub/refcount.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub reference count btrees. + */ +int +xchk_setup_ag_refcountbt( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); +} + +/* Reference count btree scrubber. */ + +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. All of these + * fragments should start at the block number recorded in the + * previous step. + * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. 
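The fragment-tracking algorithm described above is a single-pass way of proving a simple property: every block covered by the refcount record must be overlapped by exactly rc_refcount reverse mappings. The brute-force model below checks that same property directly on plain arrays; it is illustrative only, deliberately ignores the CoW-staging special case, and uses none of the kernel's types.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct demo_span { uint32_t start; uint32_t len; };	/* one reverse mapping */

static bool refcount_matches_rmaps(uint32_t rc_start, uint32_t rc_len,
				   uint32_t rc_refcount,
				   const struct demo_span *rmaps,
				   size_t nr_rmaps)
{
	for (uint32_t bno = rc_start; bno < rc_start + rc_len; bno++) {
		uint32_t owners = 0;

		for (size_t i = 0; i < nr_rmaps; i++) {
			if (bno >= rmaps[i].start &&
			    bno - rmaps[i].start < rmaps[i].len)
				owners++;
		}
		if (owners != rc_refcount)
			return false;	/* coverage disagrees with the record */
	}
	return true;
}

The in-kernel version avoids the per-block inner loop by consuming rmaps in agbno order and keeping a worklist of fragments that are still open, which is what the numbered steps in the comment describe.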
+ */ +struct xchk_refcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xchk_refcnt_check { + struct xfs_scrub *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_agblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the refcountbt says we should have. + */ +STATIC int +xchk_refcountbt_rmap_check( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_refcnt_check *refchk = priv; + struct xchk_refcnt_frag *frag; + xfs_agblock_t rm_last; + xfs_agblock_t rc_last; + int error = 0; + + if (xchk_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xchk_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. + */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the refcountbt's refcount minus the + * number of extents that totally covered the refcountbt extent), + * we have a refcountbt error. + */ +STATIC void +xchk_refcountbt_process_rmap_fragments( + struct xchk_refcnt_check *refchk) +{ + struct list_head worklist; + struct xchk_refcnt_frag *frag; + struct xchk_refcnt_frag *n; + xfs_agblock_t bno; + xfs_agblock_t rbno; + xfs_agblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->rc.rc_refcount - refchk->nr refcount) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLAGBLOCK; + + /* Make sure the fragments actually /are/ in agbno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. 
+ */ + nr = 0; + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLAGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kfree(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kfree(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xchk_refcountbt_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + struct xchk_refcnt_check refchk = { + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xchk_refcnt_frag *frag; + struct xchk_refcnt_frag *n; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. */ + memset(&low, 0, sizeof(low)); + low.rm_startblock = irec->rc_startblock; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + &xchk_refcountbt_rmap_check, &refchk); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + goto out_free; + + xchk_refcountbt_process_rmap_fragments(&refchk); + if (irec->rc_refcount != refchk.seen) { + trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen); + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + } + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Cross-reference with the other btrees. 
*/ +STATIC void +xchk_refcountbt_xref( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount); + xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock, + irec->rc_blockcount); + xchk_refcountbt_xref_rmap(sc, irec); +} + +struct xchk_refcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next AG block where we aren't expecting shared extents. */ + xfs_agblock_t next_unshared_agbno; + + /* Number of CoW blocks we expect. */ + xfs_agblock_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +STATIC int +xchk_refcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_agblock_t *next_bno = priv; + + if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. + */ +static inline void +xchk_refcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_refcbt_records *rrc, + xfs_agblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_agblock_t next_bno = NULLAGBLOCK; + int error; + + if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_agbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + xchk_refcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur); +} + +static inline bool +xchk_refcount_mergeable( + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount > 0) + return false; + + if (r1->rc_domain != r2->rc_domain) + return false; + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + MAXREFCEXTLEN) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_refcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_refcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + +/* Scrub a refcountbt record. */ +STATIC int +xchk_refcountbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_refcount_irec irec; + struct xchk_refcbt_records *rrc = bs->private; + + xfs_refcount_btrec_to_irec(rec, &irec); + if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + rrc->cow_blocks += irec.rc_blockcount; + + /* Shared records always come before CoW records. 
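The mergeable-record check above flags adjacent refcount records that the btree should have combined into one. The standalone predicate below models the merge criteria described in the comments: a previous record must exist, and the pair must share a domain and refcount, be physically contiguous, and still fit in a single record. The struct is a simplified stand-in for struct xfs_refcount_irec and max_len for MAXREFCEXTLEN.

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for struct xfs_refcount_irec. */
struct demo_refc_rec {
	int		domain;		/* shared data vs. CoW staging */
	uint32_t	startblock;
	uint32_t	blockcount;
	uint32_t	refcount;
};

static bool records_should_have_merged(const struct demo_refc_rec *r1,
				       const struct demo_refc_rec *r2,
				       uint64_t max_len)
{
	if (r1->blockcount == 0)	/* no previous record seen yet */
		return false;
	if (r1->domain != r2->domain)
		return false;
	if (r1->startblock + r1->blockcount != r2->startblock)
		return false;
	if (r1->refcount != r2->refcount)
		return false;
	/* max_len stands in for MAXREFCEXTLEN, the largest single record. */
	return (uint64_t)r1->blockcount + r2->blockcount <= max_len;
}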
*/ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + + xchk_refcountbt_check_mergeable(bs, rrc, &irec); + xchk_refcountbt_xref(bs->sc, &irec); + + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_agbno = irec.rc_startblock + + irec.rc_blockcount; + } + + return 0; +} + +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xchk_refcount_xref_rmap( + struct xfs_scrub *sc, + xfs_filblks_t cow_blocks) +{ + xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks); + if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error)) + return; + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_REFC, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != cow_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* Scrub the refcount btree for some AG. */ +int +xchk_refcountbt( + struct xfs_scrub *sc) +{ + struct xchk_refcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_agbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; + int error; + + error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, + &XFS_RMAP_OINFO_REFC, &rrc); + if (error) + return error; + + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the AG have at most one reverse mapping. + */ + xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks); + + xchk_refcount_xref_rmap(sc, rrc.cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the refcountbt. */ +void +xchk_xref_is_cow_staging( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + int has_refcount; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. 
Only file data blocks + * can have multiple owners. + */ +void +xchk_xref_is_not_shared( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_cow_staging( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c new file mode 100644 index 0000000000..1b8b5439f2 --- /dev/null +++ b/fs/xfs/scrub/repair.c @@ -0,0 +1,736 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/stats.h" + +/* + * Attempt to repair some metadata, if the metadata is corrupt and userspace + * told us to fix it. This function returns -EAGAIN to mean "re-run scrub", + * and will set *fixed to true if it thinks it repaired anything. + */ +int +xrep_attempt( + struct xfs_scrub *sc, + struct xchk_stats_run *run) +{ + u64 repair_start; + int error = 0; + + trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); + + xchk_ag_btcur_free(&sc->sa); + + /* Repair whatever's broken. */ + ASSERT(sc->ops->repair); + run->repair_attempted = true; + repair_start = xchk_stats_now(); + error = sc->ops->repair(sc); + trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error); + run->repair_ns += xchk_stats_elapsed_ns(repair_start); + switch (error) { + case 0: + /* + * Repair succeeded. Commit the fixes and perform a second + * scrub so that we can tell userspace if we fixed the problem. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->flags |= XREP_ALREADY_FIXED; + run->repair_succeeded = true; + return -EAGAIN; + case -ECHRNG: + sc->flags |= XCHK_NEED_DRAIN; + run->retries++; + return -EAGAIN; + case -EDEADLOCK: + /* Tell the caller to try again having grabbed all the locks. 
*/ + if (!(sc->flags & XCHK_TRY_HARDER)) { + sc->flags |= XCHK_TRY_HARDER; + run->retries++; + return -EAGAIN; + } + /* + * We tried harder but still couldn't grab all the resources + * we needed to fix it. The corruption has not been fixed, + * so exit to userspace with the scan's output flags unchanged. + */ + return 0; + default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. + */ + ASSERT(error != -EAGAIN); + return error; + } +} + +/* + * Complain about unfixable problems in the filesystem. We don't log + * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver + * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the + * administrator isn't running xfs_scrub in no-repairs mode. + * + * Use this helper function because _ratelimited silently declares a static + * structure to track rate limiting information. + */ +void +xrep_failure( + struct xfs_mount *mp) +{ + xfs_alert_ratelimited(mp, +"Corruption not fixed during online repair. Unmount and run xfs_repair."); +} + +/* + * Repair probe -- userspace uses this to probe if we're willing to repair a + * given mountpoint. + */ +int +xrep_probe( + struct xfs_scrub *sc) +{ + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* + * Roll a transaction, keeping the AG headers locked and reinitializing + * the btree cursors. + */ +int +xrep_roll_ag_trans( + struct xfs_scrub *sc) +{ + int error; + + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } + + /* + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + /* Join the AG headers to the new transaction. */ + if (sc->sa.agi_bp) + xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); + if (sc->sa.agf_bp) + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + + return 0; +} + +/* Finish all deferred work attached to the repair transaction. */ +int +xrep_defer_finish( + struct xfs_scrub *sc) +{ + int error; + + /* + * Keep the AG header buffers locked while we complete deferred work + * items. Ensure that both AG buffers are dirty and held when we roll + * the transaction so that they move forward in the log without losing + * the bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. 
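To make the error protocol above easier to follow outside of kernel context, here is a small standalone C model of the scrub/repair retry loop. Everything in it (toy_scrub, toy_repair, the deadlock counter) is invented for illustration; it only mirrors the -EAGAIN / -EDEADLOCK / try-harder decisions made by xrep_attempt, not the real locking or transactions:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_scrub {
        bool try_harder;        /* models the XCHK_TRY_HARDER flag */
        int deadlocks_left;     /* how many times the fake repair deadlocks */
};

/* Fake repair that deadlocks the first few times it is called. */
static int toy_repair(struct toy_scrub *sc)
{
        if (sc->deadlocks_left > 0) {
                sc->deadlocks_left--;
                return -EDEADLK;
        }
        return 0;
}

/*
 * Mirror of the protocol above: success asks for a verifying re-scrub via
 * -EAGAIN, the first deadlock sets the try-harder flag and retries, and a
 * second deadlock gives up with the scan flags unchanged.
 */
static int toy_attempt(struct toy_scrub *sc)
{
        int error = toy_repair(sc);

        switch (error) {
        case 0:
                return -EAGAIN;
        case -EDEADLK:
                if (!sc->try_harder) {
                        sc->try_harder = true;
                        return -EAGAIN;
                }
                return 0;
        default:
                return error;
        }
}

int main(void)
{
        struct toy_scrub sc = { .deadlocks_left = 2 };
        int error;
        int passes = 0;

        /* The driver simply re-runs while the attempt asks for -EAGAIN. */
        do {
                error = toy_attempt(&sc);
                passes++;
        } while (error == -EAGAIN && passes < 3);

        printf("passes=%d error=%d try_harder=%d\n",
                        passes, error, (int)sc.try_harder);
        return 0;
}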
+ */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } + + /* + * Finish all deferred work items. We still hold the AG header buffers + * locked regardless of whether or not that succeeds. On failure, the + * buffers will be released during teardown on our way out of the + * kernel. If successful, join the buffers to the new transaction + * and move on. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * Release the hold that we set above because defer_finish won't do + * that for us. The defer roll code redirties held buffers after each + * roll, so the AG header buffers should be ready for logging. + */ + if (sc->sa.agi_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + if (sc->sa.agf_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + + return 0; +} + +/* + * Does the given AG have enough space to rebuild a btree? Neither AG + * reservation can be critical, and we must have enough space (factoring + * in AG reservations) to construct a whole btree. + */ +bool +xrep_ag_has_space( + struct xfs_perag *pag, + xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type) +{ + return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) && + !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) && + pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks; +} + +/* + * Figure out how many blocks to reserve for an AG repair. We calculate the + * worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-AG btree. + */ +xfs_extlen_t +xrep_calc_ag_resblks( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + struct xfs_perag *pag; + struct xfs_buf *bp; + xfs_agino_t icount = NULLAGINO; + xfs_extlen_t aglen = NULLAGBLOCK; + xfs_extlen_t usedlen; + xfs_extlen_t freelen; + xfs_extlen_t bnobt_sz; + xfs_extlen_t inobt_sz; + xfs_extlen_t rmapbt_sz; + xfs_extlen_t refcbt_sz; + int error; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + + pag = xfs_perag_get(mp, sm->sm_agno); + if (xfs_perag_initialised_agi(pag)) { + /* Use in-core icount if possible. */ + icount = pag->pagi_count; + } else { + /* Try to get the actual counters from disk. */ + error = xfs_ialloc_read_agi(pag, NULL, &bp); + if (!error) { + icount = pag->pagi_count; + xfs_buf_relse(bp); + } + } + + /* Now grab the block counters from the AGF. */ + error = xfs_alloc_read_agf(pag, NULL, 0, &bp); + if (error) { + aglen = pag->block_count; + freelen = aglen; + usedlen = aglen; + } else { + struct xfs_agf *agf = bp->b_addr; + + aglen = be32_to_cpu(agf->agf_length); + freelen = be32_to_cpu(agf->agf_freeblks); + usedlen = aglen - freelen; + xfs_buf_relse(bp); + } + + /* If the icount is impossible, make some worst-case assumptions. */ + if (icount == NULLAGINO || + !xfs_verify_agino(pag, icount)) { + icount = pag->agino_max - pag->agino_min + 1; + } + + /* If the block counts are impossible, make worst-case assumptions. */ + if (aglen == NULLAGBLOCK || + aglen != pag->block_count || + freelen >= aglen) { + aglen = pag->block_count; + freelen = aglen; + usedlen = aglen; + } + xfs_perag_put(pag); + + trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, + freelen, usedlen); + + /* + * Figure out how many blocks we'd need worst case to rebuild + * each type of btree. 
Note that we can only rebuild the + * bnobt/cntbt or inobt/finobt as pairs. + */ + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen); + if (xfs_has_sparseinodes(mp)) + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_HOLEMASK_BIT); + else + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_CHUNK); + if (xfs_has_finobt(mp)) + inobt_sz *= 2; + if (xfs_has_reflink(mp)) + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen); + else + refcbt_sz = 0; + if (xfs_has_rmapbt(mp)) { + /* + * Guess how many blocks we need to rebuild the rmapbt. + * For non-reflink filesystems we can't have more records than + * used blocks. However, with reflink it's possible to have + * more than one rmap record per AG block. We don't know how + * many rmaps there could be in the AG, so we start off with + * what we hope is an generous over-estimation. + */ + if (xfs_has_reflink(mp)) + rmapbt_sz = xfs_rmapbt_calc_size(mp, + (unsigned long long)aglen * 2); + else + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen); + } else { + rmapbt_sz = 0; + } + + trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, + inobt_sz, rmapbt_sz, refcbt_sz); + + return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); +} + +/* + * Reconstructing per-AG Btrees + * + * When a space btree is corrupt, we don't bother trying to fix it. Instead, + * we scan secondary space metadata to derive the records that should be in + * the damaged btree, initialize a fresh btree root, and insert the records. + * Note that for rebuilding the rmapbt we scan all the primary data to + * generate the new records. + * + * However, that leaves the matter of removing all the metadata describing the + * old broken structure. For primary metadata we use the rmap data to collect + * every extent with a matching rmap owner (bitmap); we then iterate all other + * metadata structures with the same rmap owner to collect the extents that + * cannot be removed (sublist). We then subtract sublist from bitmap to + * derive the blocks that were used by the old btree. These blocks can be + * reaped. + * + * For rmapbt reconstructions we must use different tactics for extent + * collection. First we iterate all primary metadata (this excludes the old + * rmapbt, obviously) to generate new rmap records. The gaps in the rmap + * records are collected as bitmap. The bnobt records are collected as + * sublist. As with the other btrees we subtract sublist from bitmap, and the + * result (since the rmapbt lives in the free space) are the blocks from the + * old rmapbt. + */ + +/* Ensure the freelist is the correct size. */ +int +xrep_fix_freelist( + struct xfs_scrub *sc, + bool can_shrink) +{ + struct xfs_alloc_arg args = {0}; + + args.mp = sc->mp; + args.tp = sc->tp; + args.agno = sc->sa.pag->pag_agno; + args.alignment = 1; + args.pag = sc->sa.pag; + + return xfs_alloc_fix_freelist(&args, + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); +} + +/* + * Finding per-AG Btree Roots for AGF/AGI Reconstruction + * + * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild + * the AG headers by using the rmap data to rummage through the AG looking for + * btree roots. This is not guaranteed to work if the AG is heavily damaged + * or the rmap data are corrupt. + * + * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL + * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the + * AGI is being rebuilt. It must maintain these locks until it's safe for + * other threads to change the btrees' shapes. 
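The "subtract the in-use sublist from the owner bitmap" reaping strategy described above boils down to plain set arithmetic. The standalone sketch below models it with a 64-block toy AG and ordinary integers instead of the kernel's bitmap types; the block ranges are made up for illustration:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        /*
         * Model a tiny 64-block AG with one bit per block.  "bitmap" holds
         * every block whose rmap owner matches the btree being rebuilt;
         * "sublist" holds the blocks of that owner that are still in use by
         * other structures and therefore must not be freed.
         */
        uint64_t bitmap  = 0x0000000000ff3c00ULL;   /* owner-matching blocks */
        uint64_t sublist = 0x0000000000003c00ULL;   /* still in use elsewhere */

        /* Blocks that belonged only to the old, broken btree. */
        uint64_t reap = bitmap & ~sublist;

        printf("old btree blocks to reap: 0x%016" PRIx64 "\n", reap);
        return 0;
}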
The caller provides + * information about the btrees to look for by passing in an array of + * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set. + * The (root, height) fields will be set on return if anything is found. The + * last element of the array should have a NULL buf_ops to mark the end of the + * array. + * + * For every rmapbt record matching any of the rmap owners in btree_info, + * read each block referenced by the rmap record. If the block is a btree + * block from this filesystem matching any of the magic numbers and has a + * level higher than what we've already seen, remember the block and the + * height of the tree required to have such a block. When the call completes, + * we return the highest block we've found for each btree description; those + * should be the roots. + */ + +struct xrep_findroot { + struct xfs_scrub *sc; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + struct xrep_find_ag_btree *btree_info; +}; + +/* See if our block is in the AGFL. */ +STATIC int +xrep_findroot_agfl_walk( + struct xfs_mount *mp, + xfs_agblock_t bno, + void *priv) +{ + xfs_agblock_t *agbno = priv; + + return (*agbno == bno) ? -ECANCELED : 0; +} + +/* Does this block match the btree information passed in? */ +STATIC int +xrep_findroot_block( + struct xrep_findroot *ri, + struct xrep_find_ag_btree *fab, + uint64_t owner, + xfs_agblock_t agbno, + bool *done_with_block) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_buf *bp; + struct xfs_btree_block *btblock; + xfs_daddr_t daddr; + int block_level; + int error = 0; + + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); + + /* + * Blocks in the AGFL have stale contents that might just happen to + * have a matching magic and uuid. We don't want to pull these blocks + * in as part of a tree root, so we have to filter out the AGFL stuff + * here. If the AGFL looks insane we'll just refuse to repair. + */ + if (owner == XFS_RMAP_OWN_AG) { + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, + xrep_findroot_agfl_walk, &agbno); + if (error == -ECANCELED) + return 0; + if (error) + return error; + } + + /* + * Read the buffer into memory so that we can see if it's a match for + * our btree type. We have no clue if it is beforehand, and we want to + * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which + * will cause needless disk reads in subsequent calls to this function) + * and logging metadata verifier failures. + * + * Therefore, pass in NULL buffer ops. If the buffer was already in + * memory from some other caller it will already have b_ops assigned. + * If it was in memory from a previous unsuccessful findroot_block + * call, the buffer won't have b_ops but it should be clean and ready + * for us to try to verify if the read call succeeds. The same applies + * if the buffer wasn't in memory at all. + * + * Note: If we never match a btree type with this buffer, it will be + * left in memory with NULL b_ops. This shouldn't be a problem unless + * the buffer gets written. + */ + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + + /* Ensure the block magic matches the btree type we're looking for. */ + btblock = XFS_BUF_TO_BLOCK(bp); + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) + goto out; + + /* + * If the buffer already has ops applied and they're not the ones for + * this btree type, we know this block doesn't match the btree and we + * can bail out. 
+ * + * If the buffer ops match ours, someone else has already validated + * the block for us, so we can move on to checking if this is a root + * block candidate. + * + * If the buffer does not have ops, nobody has successfully validated + * the contents and the buffer cannot be dirty. If the magic, uuid, + * and structure match this btree type then we'll move on to checking + * if it's a root block candidate. If there is no match, bail out. + */ + if (bp->b_ops) { + if (bp->b_ops != fab->buf_ops) + goto out; + } else { + ASSERT(!xfs_trans_buf_is_dirty(bp)); + if (!uuid_equal(&btblock->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid)) + goto out; + /* + * Read verifiers can reference b_ops, so we set the pointer + * here. If the verifier fails we'll reset the buffer state + * to what it was before we touched the buffer. + */ + bp->b_ops = fab->buf_ops; + fab->buf_ops->verify_read(bp); + if (bp->b_error) { + bp->b_ops = NULL; + bp->b_error = 0; + goto out; + } + + /* + * Some read verifiers will (re)set b_ops, so we must be + * careful not to change b_ops after running the verifier. + */ + } + + /* + * This block passes the magic/uuid and verifier tests for this btree + * type. We don't need the caller to try the other tree types. + */ + *done_with_block = true; + + /* + * Compare this btree block's level to the height of the current + * candidate root block. + * + * If the level matches the root we found previously, throw away both + * blocks because there can't be two candidate roots. + * + * If level is lower in the tree than the root we found previously, + * ignore this block. + */ + block_level = xfs_btree_get_level(btblock); + if (block_level + 1 == fab->height) { + fab->root = NULLAGBLOCK; + goto out; + } else if (block_level < fab->height) { + goto out; + } + + /* + * This is the highest block in the tree that we've found so far. + * Update the btree height to reflect what we've learned from this + * block. + */ + fab->height = block_level + 1; + + /* + * If this block doesn't have sibling pointers, then it's the new root + * block candidate. Otherwise, the root will be found farther up the + * tree. + */ + if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) && + btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + fab->root = agbno; + else + fab->root = NULLAGBLOCK; + + trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, + be32_to_cpu(btblock->bb_magic), fab->height - 1); +out: + xfs_trans_brelse(ri->sc->tp, bp); + return error; +} + +/* + * Do any of the blocks in this rmap record match one of the btrees we're + * looking for? + */ +STATIC int +xrep_findroot_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_findroot *ri = priv; + struct xrep_find_ag_btree *fab; + xfs_agblock_t b; + bool done; + int error = 0; + + /* Ignore anything that isn't AG metadata. */ + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) + return 0; + + /* Otherwise scan each block + btree type. */ + for (b = 0; b < rec->rm_blockcount; b++) { + done = false; + for (fab = ri->btree_info; fab->buf_ops; fab++) { + if (rec->rm_owner != fab->rmap_owner) + continue; + error = xrep_findroot_block(ri, fab, + rec->rm_owner, rec->rm_startblock + b, + &done); + if (error) + return error; + if (done) + break; + } + } + + return 0; +} + +/* Find the roots of the per-AG btrees described in btree_info. 
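The root-candidate bookkeeping that xrep_findroot_block performs above can be summarized by a small standalone model: track the highest level seen, discard the candidate when two blocks claim the same top level, and accept a block as root only if it has no sibling pointers. The block numbers below are invented and NO_ROOT stands in for NULLAGBLOCK:

#include <stdbool.h>
#include <stdio.h>

#define NO_ROOT (-1)

struct candidate {
        int agbno;              /* block number (illustrative) */
        int level;              /* btree level recorded in the block */
        bool has_sibling;       /* left or right sibling pointer set? */
};

int main(void)
{
        /* Blocks of one btree type, in the order the rmap walk finds them. */
        struct candidate blocks[] = {
                { .agbno = 10, .level = 0, .has_sibling = true  },
                { .agbno = 37, .level = 1, .has_sibling = false },
                { .agbno = 12, .level = 0, .has_sibling = true  },
        };
        int height = 0;
        int root = NO_ROOT;

        for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
                const struct candidate *b = &blocks[i];

                if (b->level + 1 == height) {
                        /* Two blocks at the top level: neither can be root. */
                        root = NO_ROOT;
                        continue;
                }
                if (b->level < height)
                        continue;       /* lower in the tree, ignore */

                /* New highest block seen so far. */
                height = b->level + 1;
                root = b->has_sibling ? NO_ROOT : b->agbno;
        }

        printf("height=%d root=%d\n", height, root);
        return 0;
}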
*/ +int +xrep_find_ag_btree_roots( + struct xfs_scrub *sc, + struct xfs_buf *agf_bp, + struct xrep_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp) +{ + struct xfs_mount *mp = sc->mp; + struct xrep_findroot ri; + struct xrep_find_ag_btree *fab; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_buf_islocked(agf_bp)); + ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp)); + + ri.sc = sc; + ri.btree_info = btree_info; + ri.agf = agf_bp->b_addr; + ri.agfl_bp = agfl_bp; + for (fab = btree_info; fab->buf_ops; fab++) { + ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); + ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner)); + fab->root = NULLAGBLOCK; + fab->height = 0; + } + + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri); + xfs_btree_del_cursor(cur, error); + + return error; +} + +/* Force a quotacheck the next time we mount. */ +void +xrep_force_quotacheck( + struct xfs_scrub *sc, + xfs_dqtype_t type) +{ + uint flag; + + flag = xfs_quota_chkd_flag(type); + if (!(flag & sc->mp->m_qflags)) + return; + + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->mp->m_qflags &= ~flag; + spin_lock(&sc->mp->m_sb_lock); + sc->mp->m_sb.sb_qflags &= ~flag; + spin_unlock(&sc->mp->m_sb_lock); + xfs_log_sb(sc->tp); + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); +} + +/* + * Attach dquots to this inode, or schedule quotacheck to fix them. + * + * This function ensures that the appropriate dquots are attached to an inode. + * We cannot allow the dquot code to allocate an on-disk dquot block here + * because we're already in transaction context with the inode locked. The + * on-disk dquot should already exist anyway. If the quota code signals + * corruption or missing quota information, schedule quotacheck, which will + * repair corruptions in the quota metadata. + */ +int +xrep_ino_dqattach( + struct xfs_scrub *sc) +{ + int error; + + error = xfs_qm_dqattach_locked(sc->ip, false); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + case -ENOENT: + xfs_err_ratelimited(sc->mp, +"inode %llu repair encountered quota error %d, quotacheck forced.", + (unsigned long long)sc->ip->i_ino, error); + if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + fallthrough; + case -ESRCH: + error = 0; + break; + default: + break; + } + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h new file mode 100644 index 0000000000..60d2a9ae5f --- /dev/null +++ b/fs/xfs/scrub/repair.h @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_REPAIR_H__ +#define __XFS_SCRUB_REPAIR_H__ + +#include "xfs_quota_defs.h" + +struct xchk_stats_run; + +static inline int xrep_notsupported(struct xfs_scrub *sc) +{ + return -EOPNOTSUPP; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR + +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. 
+ */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + + +/* Repair helpers */ + +int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +void xrep_failure(struct xfs_mount *mp); +int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_defer_finish(struct xfs_scrub *sc); +bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type); +xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); + +struct xbitmap; +struct xagb_bitmap; + +int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); + +struct xrep_find_ag_btree { + /* in: rmap owner of the btree we're looking for */ + uint64_t rmap_owner; + + /* in: buffer ops */ + const struct xfs_buf_ops *buf_ops; + + /* in: maximum btree height */ + unsigned int maxlevels; + + /* out: the highest btree block found and the tree height */ + xfs_agblock_t root; + unsigned int height; +}; + +int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, + struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); +void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); +int xrep_ino_dqattach(struct xfs_scrub *sc); + +/* Metadata repairers */ + +int xrep_probe(struct xfs_scrub *sc); +int xrep_superblock(struct xfs_scrub *sc); +int xrep_agf(struct xfs_scrub *sc); +int xrep_agfl(struct xfs_scrub *sc); +int xrep_agi(struct xfs_scrub *sc); + +#else + +static inline int +xrep_attempt( + struct xfs_scrub *sc, + struct xchk_stats_run *run) +{ + return -EOPNOTSUPP; +} + +static inline void xrep_failure(struct xfs_mount *mp) {} + +static inline xfs_extlen_t +xrep_calc_ag_resblks( + struct xfs_scrub *sc) +{ + return 0; +} + +#define xrep_probe xrep_notsupported +#define xrep_superblock xrep_notsupported +#define xrep_agf xrep_notsupported +#define xrep_agfl xrep_notsupported +#define xrep_agi xrep_notsupported + +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_REPAIR_H__ */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c new file mode 100644 index 0000000000..d29a26ecdd --- /dev/null +++ b/fs/xfs/scrub/rmap.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_ag.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/bitmap.h" + +/* + * Set us up to scrub reverse mapping btrees. + */ +int +xchk_setup_ag_rmapbt( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + return xchk_setup_ag_btree(sc, false); +} + +/* Reverse-mapping scrubber. */ + +struct xchk_rmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; + + /* Bitmaps containing all blocks for each type of AG metadata. 
*/ + struct xagb_bitmap fs_owned; + struct xagb_bitmap log_owned; + struct xagb_bitmap ag_owned; + struct xagb_bitmap inobt_owned; + struct xagb_bitmap refcbt_owned; + + /* Did we complete the AG space metadata bitmaps? */ + bool bitmaps_complete; +}; + +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xchk_rmapbt_xref_refc( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + bool non_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sa.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (flen != 0 && (non_inode || is_attr || is_bmbt || is_unwritten)) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_rmapbt_xref( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t agbno = irec->rm_startblock; + xfs_extlen_t len = irec->rm_blockcount; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_space(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_INODES) + xchk_xref_is_inode_chunk(sc, agbno, len); + else + xchk_xref_is_not_inode_chunk(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xchk_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xchk_rmapbt_xref_refc(sc, irec); +} + +/* + * Check for bogus UNWRITTEN flags in the rmapbt node block keys. + * + * In reverse mapping records, the file mapping extent state + * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not + * involved in lookups in any way. In older kernels, the functions that + * convert rmapbt records to keys forgot to filter out the extent state bit, + * even though the key comparison functions have filtered the flag correctly. + * If we spot an rmap key with the unwritten bit set in rm_offset, we should + * mark the btree as needing optimization to rebuild the btree without those + * flags. + */ +STATIC void +xchk_rmapbt_check_unwritten_in_keyflags( + struct xchk_btree *bs) +{ + struct xfs_scrub *sc = bs->sc; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *keyblock; + union xfs_btree_key *lkey, *hkey; + __be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); + unsigned int level; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + return; + + for (level = 1; level < cur->bc_nlevels; level++) { + struct xfs_buf *bp; + unsigned int ptr; + + /* Only check the first time we've seen this node block. 
*/ + if (cur->bc_levels[level].ptr > 1) + continue; + + keyblock = xfs_btree_get_block(cur, level, &bp); + for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) { + lkey = xfs_btree_key_addr(cur, ptr, keyblock); + + if (lkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + + hkey = xfs_btree_high_key_addr(cur, ptr, keyblock); + if (hkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + } + } +} + +static inline bool +xchk_rmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_reflink(sc->mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return false; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK | + XFS_RMAP_UNWRITTEN)) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_agblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; + + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rmapbt_is_shareable(bs->sc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rmap_mergeable( + struct xchk_rmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner)) + return true; + /* must be an inode owner below here */ + if (r1->rm_flags != r2->rm_flags) + return false; + if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK) + return true; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Compare an rmap for AG metadata against the metadata walk. */ +STATIC int +xchk_rmapbt_mark_bitmap( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + struct xfs_scrub *sc = bs->sc; + struct xagb_bitmap *bmp = NULL; + xfs_extlen_t fsbcount = irec->rm_blockcount; + + /* + * Skip corrupt records. 
It is essential that we detect records in the + * btree that cannot overlap but do, flag those as CORRUPT, and skip + * the bitmap comparison to avoid generating false XCORRUPT reports. + */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * If the AG metadata walk didn't complete, there's no point in + * comparing against partial results. + */ + if (!cr->bitmaps_complete) + return 0; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + bmp = &cr->fs_owned; + break; + case XFS_RMAP_OWN_LOG: + bmp = &cr->log_owned; + break; + case XFS_RMAP_OWN_AG: + bmp = &cr->ag_owned; + break; + case XFS_RMAP_OWN_INOBT: + bmp = &cr->inobt_owned; + break; + case XFS_RMAP_OWN_REFC: + bmp = &cr->refcbt_owned; + break; + } + + if (!bmp) + return 0; + + if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) { + /* + * The start of this reverse mapping corresponds to a set + * region in the bitmap. If the mapping covers more area than + * the set region, then it covers space that wasn't found by + * the AG metadata walk. + */ + if (fsbcount < irec->rm_blockcount) + xchk_btree_xref_set_corrupt(bs->sc, + bs->sc->sa.rmap_cur, 0); + } else { + /* + * The start of this reverse mapping does not correspond to a + * completely set region in the bitmap. The region wasn't + * fully set by walking the AG metadata, so this is a + * cross-referencing corruption. + */ + xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0); + } + + /* Unset the region so that we can detect missing rmap records. */ + return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount); +} + +/* Scrub an rmapbt record. */ +STATIC int +xchk_rmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rmap *cr = bs->private; + struct xfs_rmap_irec irec; + + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rmap_check_irec(bs->cur, &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + xchk_rmapbt_check_unwritten_in_keyflags(bs); + xchk_rmapbt_check_mergeable(bs, cr, &irec); + xchk_rmapbt_check_overlapping(bs, cr, &irec); + xchk_rmapbt_xref(bs->sc, &irec); + + return xchk_rmapbt_mark_bitmap(bs, cr, &irec); +} + +/* Add an AGFL block to the rmap list. */ +STATIC int +xchk_rmapbt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* + * Set up bitmaps mapping all the AG metadata to compare with the rmapbt + * records. + * + * Grab our own btree cursors here if the scrub setup function didn't give us a + * btree cursor due to reports of poor health. We need to find out if the + * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being + * XCORRUPT. 
+ */ +STATIC int +xchk_rmapbt_walk_ag_metadata( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *cur; + int error; + + /* OWN_FS: AG headers */ + error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp), + XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1); + if (error) + goto out; + + /* OWN_LOG: Internal log */ + if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + error = xagb_bitmap_set(&cr->log_owned, + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), + mp->m_sb.sb_logblocks); + if (error) + goto out; + } + + /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ + cur = sc->sa.bno_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.bno_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + cur = sc->sa.cnt_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.cnt_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur); + if (error) + goto out; + + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto out; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl, + &cr->ag_owned); + xfs_trans_brelse(sc->tp, agfl_bp); + if (error) + goto out; + + /* OWN_INOBT: inobt, finobt */ + cur = sc->sa.ino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, + XFS_BTNUM_INO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.ino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + cur = sc->sa.fino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp, XFS_BTNUM_FINO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.fino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + } + + /* OWN_REFC: refcountbt */ + if (xfs_has_reflink(sc->mp)) { + cur = sc->sa.refc_cur; + if (!cur) + cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp, + sc->sa.agf_bp, sc->sa.pag); + error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur); + if (cur != sc->sa.refc_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + } + +out: + /* + * If there's an error, set XFAIL and disable the bitmap + * cross-referencing checks, but proceed with the scrub anyway. + */ + if (error) + xchk_btree_xref_process_error(sc, sc->sa.rmap_cur, + sc->sa.rmap_cur->bc_nlevels - 1, &error); + else + cr->bitmaps_complete = true; + return 0; +} + +/* + * Check for set regions in the bitmaps; if there are any, the rmap records do + * not describe all the AG metadata. + */ +STATIC void +xchk_rmapbt_check_bitmaps( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_btree_cur *cur = sc->sa.rmap_cur; + unsigned int level; + + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XFAIL)) + return; + if (!cur) + return; + level = cur->bc_nlevels - 1; + + /* + * Any bitmap with bits still set indicates that the reverse mapping + * doesn't cover the entire primary structure. 
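The bitmap bookkeeping used by this scrubber -- pre-set the blocks found by walking the primary metadata, clear them as matching rmap records stream past, and complain about anything left over or not covered -- can be pictured with a plain integer bitmap. This is a simplified standalone sketch with invented block ranges, not the xagb_bitmap code itself:

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

/* Build a mask of "len" bits starting at "start" in a 64-block toy AG. */
static uint64_t extent_mask(unsigned start, unsigned len)
{
        uint64_t mask = (len >= 64) ? ~0ULL : ((1ULL << len) - 1);

        return mask << start;
}

int main(void)
{
        /* Blocks that the metadata walk says belong to one rmap owner. */
        uint64_t owned = extent_mask(4, 8) | extent_mask(20, 3);
        bool xcorrupt = false;

        /* Toy rmap records claiming that same ownership. */
        struct { unsigned start, len; } rmaps[] = {
                { 4, 8 },       /* matches the first set region exactly */
                { 20, 2 },      /* covers only part of the second region */
        };

        for (unsigned i = 0; i < sizeof(rmaps) / sizeof(rmaps[0]); i++) {
                uint64_t m = extent_mask(rmaps[i].start, rmaps[i].len);

                /* Every block the rmap claims must be in the bitmap... */
                if ((owned & m) != m)
                        xcorrupt = true;
                /* ...and once seen, it is cleared so leftovers show up. */
                owned &= ~m;
        }

        /* Bits still set: metadata blocks with no rmap record at all. */
        if (owned != 0)
                xcorrupt = true;

        printf("leftover=0x%016" PRIx64 " xcorrupt=%d\n", owned, (int)xcorrupt);
        return 0;
}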
+ */ + if (xagb_bitmap_hweight(&cr->fs_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->log_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->ag_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->inobt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); +} + +/* Scrub the rmap btree for some AG. */ +int +xchk_rmapbt( + struct xfs_scrub *sc) +{ + struct xchk_rmap *cr; + int error; + + cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS); + if (!cr) + return -ENOMEM; + + xagb_bitmap_init(&cr->fs_owned); + xagb_bitmap_init(&cr->log_owned); + xagb_bitmap_init(&cr->ag_owned); + xagb_bitmap_init(&cr->inobt_owned); + xagb_bitmap_init(&cr->refcbt_owned); + + error = xchk_rmapbt_walk_ag_metadata(sc, cr); + if (error) + goto out; + + error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, + &XFS_RMAP_OINFO_AG, cr); + if (error) + goto out; + + xchk_rmapbt_check_bitmaps(sc, cr); + +out: + xagb_bitmap_destroy(&cr->refcbt_owned); + xagb_bitmap_destroy(&cr->inobt_owned); + xagb_bitmap_destroy(&cr->ag_owned); + xagb_bitmap_destroy(&cr->log_owned); + xagb_bitmap_destroy(&cr->fs_owned); + kfree(cr); + return error; +} + +/* xref check that the extent is owned only by a given owner */ +void +xchk_xref_is_only_owned_by( + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) +{ + struct xfs_rmap_matches res; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* xref check that the extent is not owned by a given owner */ +void +xchk_xref_is_not_owned_by( + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) +{ + struct xfs_rmap_matches res; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (res.matches != 0) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* xref check that the extent has no reverse mapping at all */ +void +xchk_xref_has_no_owner( + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c new file mode 100644 index 0000000000..008ddb599e --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +/* Set us up with the realtime metadata locked. */ +int +xchk_setup_rtbitmap( + struct xfs_scrub *sc) +{ + int error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + return 0; +} + +/* Realtime bitmap. */ + +/* Scrub a free extent record from the realtime bitmap. */ +STATIC int +xchk_rtbitmap_rec( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub *sc = priv; + xfs_rtblock_t startblock; + xfs_rtblock_t blockcount; + + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; + + if (!xfs_verify_rtext(mp, startblock, blockcount)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; +} + +/* Make sure the entire rtbitmap file is mapped with written extents. */ +STATIC int +xchk_rtbitmap_check_extents( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_bmbt_irec map; + xfs_rtblock_t off; + int nmap; + int error = 0; + + for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rbmip, off, + mp->m_sb.sb_rbmblocks - off, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + break; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + break; + } + + off += map.br_blockcount; + } + + return error; +} + +/* Scrub the realtime bitmap. */ +int +xchk_rtbitmap( + struct xfs_scrub *sc) +{ + int error; + + /* Is the size of the rtbitmap correct? */ + if (sc->mp->m_rbmip->i_disk_size != + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { + xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + return 0; + } + + /* Invoke the fork scrubber. 
*/ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + error = xchk_rtbitmap_check_extents(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + +out: + return error; +} + +/* xref check that the extent is not free in the rtbitmap */ +void +xchk_xref_is_used_rt_space( + struct xfs_scrub *sc, + xfs_rtblock_t fsbno, + xfs_extlen_t len) +{ + xfs_rtblock_t startext; + xfs_rtblock_t endext; + xfs_rtblock_t extcount; + bool is_free; + int error; + + if (xchk_skip_xref(sc->sm)) + return; + + startext = fsbno; + endext = fsbno + len - 1; + do_div(startext, sc->mp->m_sb.sb_rextsize); + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, + &is_free); + if (!xchk_should_check_xref(sc, &error, NULL)) + goto out_unlock; + if (is_free) + xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino); +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c new file mode 100644 index 0000000000..437ed9acbb --- /dev/null +++ b/fs/xfs/scrub/rtsummary.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" + +/* + * Realtime Summary + * ================ + * + * We check the realtime summary by scanning the realtime bitmap file to create + * a new summary file incore, and then we compare the computed version against + * the ondisk version. We use the 'xfile' functionality to store this + * (potentially large) amount of data in pageable memory. + */ + +/* Set us up to check the rtsummary file. */ +int +xchk_setup_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* + * Create an xfile to construct a new rtsummary file. The xfile allows + * us to avoid pinning kernel memory for this purpose. + */ + descr = xchk_xfile_descr(sc, "realtime summary file"); + error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + kfree(descr); + if (error) + return error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + /* Allocate a memory buffer for the summary comparison. */ + sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + error = xchk_install_live_inode(sc, mp->m_rsumip); + if (error) + return error; + + /* + * Locking order requires us to take the rtbitmap first. We must be + * careful to unlock it ourselves when we are done with the rtbitmap + * file since the scrub infrastructure won't do that for us. Only + * then we can lock the rtsummary inode. 
+ */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return 0; +} + +/* Helper functions to record suminfo words in an xfile. */ + +typedef unsigned int xchk_rtsumoff_t; + +static inline int +xfsum_load( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info) +{ + return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_store( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + const xfs_suminfo_t info) +{ + return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_copyout( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info, + unsigned int nr_words) +{ + return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + sumoff << XFS_WORDLOG); +} + +/* Update the summary file to reflect the free extent that we've accumulated. */ +STATIC int +xchk_rtsum_record_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub *sc = priv; + xfs_fileoff_t rbmoff; + xfs_rtblock_t rtbno; + xfs_filblks_t rtlen; + xchk_rtsumoff_t offs; + unsigned int lenlog; + xfs_suminfo_t v = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* Compute the relevant location in the rtsum file. */ + rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); + lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + + if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + return -EFSCORRUPTED; + } + + /* Bump the summary count. */ + error = xfsum_load(sc, offs, &v); + if (error) + return error; + + v++; + trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, + lenlog, offs, v); + + return xfsum_store(sc, offs, v); +} + +/* Compute the realtime summary from the realtime bitmap. */ +STATIC int +xchk_rtsum_compute( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long rtbmp_bytes; + + /* If the bitmap size doesn't match the computed size, bail. */ + rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); + if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != + mp->m_rbmip->i_disk_size) + return -EFSCORRUPTED; + + return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, + sc); +} + +/* Compare the rtsummary file against the one we computed. */ +STATIC int +xchk_rtsum_compare( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_bmbt_irec map; + xfs_fileoff_t off; + xchk_rtsumoff_t sumoff = 0; + int nmap; + + for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + return 0; + } + + /* Read a block's worth of ondisk rtsummary file. 
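Conceptually, the summary being rebuilt here is a two-dimensional histogram: one counter per (log2 of free extent length, rtbitmap block holding the extent's start), which is what XFS_SUMOFFS flattens into a file offset. The standalone sketch below models that accumulation with made-up geometry constants; the real code stores the counters in an xfile rather than a static array:

#include <stdio.h>

#define RBMBLOCKS         4     /* toy: rtbitmap blocks in the fs */
#define EXTS_PER_RBMBLOCK 128   /* toy: rt extents tracked per bitmap block */
#define MAX_LOG           8     /* toy: log2 of the largest free extent + 1 */

/*
 * summary[lg][bbno]: number of free extents of length ~2^lg whose start
 * falls in rtbitmap block bbno.
 */
static unsigned summary[MAX_LOG][RBMBLOCKS];

static unsigned ilog2_u(unsigned v)
{
        unsigned r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        /* Toy free extents from a rtbitmap walk: (start extent, length). */
        struct { unsigned start, len; } frees[] = {
                { 0, 1 }, { 10, 4 }, { 300, 64 },
        };

        for (unsigned i = 0; i < sizeof(frees) / sizeof(frees[0]); i++) {
                unsigned bbno = frees[i].start / EXTS_PER_RBMBLOCK;
                unsigned lenlog = ilog2_u(frees[i].len);

                summary[lenlog][bbno]++;
        }

        for (unsigned lg = 0; lg < MAX_LOG; lg++)
                for (unsigned bbno = 0; bbno < RBMBLOCKS; bbno++)
                        if (summary[lg][bbno])
                                printf("log=%u bbno=%u count=%u\n",
                                                lg, bbno, summary[lg][bbno]);
        return 0;
}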
*/ + error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + /* Read a block's worth of computed rtsummary file. */ + error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + if (error) { + xfs_trans_brelse(sc->tp, bp); + return error; + } + + if (memcmp(bp->b_addr, sc->buf, + mp->m_blockwsize << XFS_WORDLOG) != 0) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + + xfs_trans_brelse(sc->tp, bp); + sumoff += mp->m_blockwsize; + } + + return 0; +} + +/* Scrub the realtime summary. */ +int +xchk_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Invoke the fork scrubber. */ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out_rbm; + + /* Construct the new summary file from the rtbitmap. */ + error = xchk_rtsum_compute(sc); + if (error == -EFSCORRUPTED) { + /* + * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref + * error since we're checking the summary file. + */ + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + error = 0; + goto out_rbm; + } + if (error) + goto out_rbm; + + /* Does the computed summary file match the actual rtsummary file? */ + error = xchk_rtsum_compare(sc); + +out_rbm: + /* Unlock the rtbitmap since we're done with it. */ + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c new file mode 100644 index 0000000000..4849efcaa3 --- /dev/null +++ b/fs/xfs/scrub/scrub.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/health.h" +#include "scrub/stats.h" +#include "scrub/xfile.h" + +/* + * Online Scrub and Repair + * + * Traditionally, XFS (the kernel driver) did not know how to check or + * repair on-disk data structures. That task was left to the xfs_check + * and xfs_repair tools, both of which require taking the filesystem + * offline for a thorough but time consuming examination. Online + * scrub & repair, on the other hand, enables us to check the metadata + * for obvious errors while carefully stepping around the filesystem's + * ongoing operations, locking rules, etc. + * + * Given that most XFS metadata consist of records stored in a btree, + * most of the checking functions iterate the btree blocks themselves + * looking for irregularities. When a record block is encountered, each + * record can be checked for obviously bad values. Record values can + * also be cross-referenced against other btrees to look for potential + * misunderstandings between pieces of metadata. + * + * It is expected that the checkers responsible for per-AG metadata + * structures will lock the AG headers (AGI, AGF, AGFL), iterate the + * metadata structure, and perform any relevant cross-referencing before + * unlocking the AG and returning the results to userspace. 
These + * scrubbers must not keep an AG locked for too long to avoid tying up + * the block and inode allocators. + * + * Block maps and b-trees rooted in an inode present a special challenge + * because they can involve extents from any AG. The general scrubber + * structure of lock -> check -> xref -> unlock still holds, but AG + * locking order rules /must/ be obeyed to avoid deadlocks. The + * ordering rule, of course, is that we must lock in increasing AG + * order. Helper functions are provided to track which AG headers we've + * already locked. If we detect an imminent locking order violation, we + * can signal a potential deadlock, in which case the scrubber can jump + * out to the top level, lock all the AGs in order, and retry the scrub. + * + * For file data (directories, extended attributes, symlinks) scrub, we + * can simply lock the inode and walk the data. For btree data + * (directories and attributes) we follow the same btree-scrubbing + * strategy outlined previously to check the records. + * + * We use a bit of trickery with transactions to avoid buffer deadlocks + * if there is a cycle in the metadata. The basic problem is that + * travelling down a btree involves locking the current buffer at each + * tree level. If a pointer should somehow point back to a buffer that + * we've already examined, we will deadlock due to the second buffer + * locking attempt. Note however that grabbing a buffer in transaction + * context links the locked buffer to the transaction. If we try to + * re-grab the buffer in the context of the same transaction, we avoid + * the second lock attempt and continue. Between the verifier and the + * scrubber, something will notice that something is amiss and report + * the corruption. Therefore, each scrubber will allocate an empty + * transaction, attach buffers to it, and cancel the transaction at the + * end of the scrub run. Cancelling a non-dirty transaction simply + * unlocks the buffers. + * + * There are four pieces of data that scrub can communicate to + * userspace. The first is the error code (errno), which can be used to + * communicate operational errors in performing the scrub. There are + * also three flags that can be set in the scrub context. If the data + * structure itself is corrupt, the CORRUPT flag will be set. If + * the metadata is correct but otherwise suboptimal, the PREEN flag + * will be set. + * + * We perform secondary validation of filesystem metadata by + * cross-referencing every record with all other available metadata. + * For example, for block mapping extents, we verify that there are no + * records in the free space and inode btrees corresponding to that + * space extent and that there is a corresponding entry in the reverse + * mapping btree. Inconsistent metadata is noted by setting the + * XCORRUPT flag; btree query function errors are noted by setting the + * XFAIL flag and deleting the cursor to prevent further attempts to + * cross-reference with a defective btree. + * + * If a piece of metadata proves corrupt or suboptimal, the userspace + * program can ask the kernel to apply some tender loving care (TLC) to + * the metadata object by setting the REPAIR flag and re-calling the + * scrub ioctl. "Corruption" is defined by metadata violating the + * on-disk specification; operations cannot continue if the violation is + * left untreated. It is possible for XFS to continue if an object is + * "suboptimal", however performance may be degraded. 
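A driver program might triage those outputs roughly as follows. This standalone sketch only illustrates the flag semantics described above (the OFLAG_* constants mirror the XFS_SCRUB_OFLAG_* names but carry made-up values); it is not the policy of the real xfs_scrub tool:

#include <stdio.h>

/* Toy stand-ins for the scrub output flags discussed above. */
#define OFLAG_CORRUPT   (1u << 0)       /* structure itself is bad */
#define OFLAG_PREEN     (1u << 1)       /* correct but could be optimized */
#define OFLAG_XCORRUPT  (1u << 2)       /* cross-reference disagreement */
#define OFLAG_XFAIL     (1u << 3)       /* cross-reference could not run */

static const char *next_step(unsigned oflags)
{
        if (oflags & (OFLAG_CORRUPT | OFLAG_XCORRUPT))
                return "re-run with the repair flag set";
        if (oflags & OFLAG_PREEN)
                return "optionally re-run to optimize";
        if (oflags & OFLAG_XFAIL)
                return "cross-referencing failed; check the other structures";
        return "metadata looks clean";
}

int main(void)
{
        unsigned results[] = {
                0,
                OFLAG_PREEN,
                OFLAG_CORRUPT,
                OFLAG_XCORRUPT | OFLAG_XFAIL,
        };

        for (unsigned i = 0; i < sizeof(results) / sizeof(results[0]); i++)
                printf("flags=0x%x -> %s\n", results[i],
                                next_step(results[i]));
        return 0;
}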
Repairs are + * usually performed by rebuilding the metadata entirely out of + * redundant metadata. Optimizing, on the other hand, can sometimes be + * done without rebuilding entire structures. + * + * Generally speaking, the repair code has the following code structure: + * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. + * The first check helps us figure out if we need to rebuild or simply + * optimize the structure so that the rebuild knows what to do. The + * second check evaluates the completeness of the repair; that is what + * is reported to userspace. + * + * A quick note on symbol prefixes: + * - "xfs_" are general XFS symbols. + * - "xchk_" are symbols related to metadata checking. + * - "xrep_" are symbols related to metadata repair. + * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS. + */ + +/* + * Scrub probe -- userspace uses this to probe if we're willing to scrub + * or repair a given mountpoint. This will be used by xfs_scrub to + * probe the kernel's abilities to scrub (and repair) the metadata. We + * do this by validating the ioctl inputs from userspace, preparing the + * filesystem for a scrub (or a repair) operation, and immediately + * returning to userspace. Userspace can use the returned errno and + * structure state to decide (in broad terms) if scrub/repair are + * supported by the running kernel. + */ +static int +xchk_probe( + struct xfs_scrub *sc) +{ + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* Scrub setup and teardown */ + +static inline void +xchk_fsgates_disable( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XCHK_FSGATES_ALL)) + return; + + trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + + if (sc->flags & XCHK_FSGATES_DRAIN) + xfs_drain_wait_disable(); + + sc->flags &= ~XCHK_FSGATES_ALL; +} + +/* Free all the resources and finish the transactions. */ +STATIC int +xchk_teardown( + struct xfs_scrub *sc, + int error) +{ + xchk_ag_free(sc, &sc->sa); + if (sc->tp) { + if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + error = xfs_trans_commit(sc->tp); + else + xfs_trans_cancel(sc->tp); + sc->tp = NULL; + } + if (sc->ip) { + if (sc->ilock_flags) + xchk_iunlock(sc, sc->ilock_flags); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + } + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + if (sc->xfile) { + xfile_destroy(sc->xfile); + sc->xfile = NULL; + } + if (sc->buf) { + if (sc->buf_cleanup) + sc->buf_cleanup(sc->buf); + kvfree(sc->buf); + sc->buf_cleanup = NULL; + sc->buf = NULL; + } + + xchk_fsgates_disable(sc); + return error; +} + +/* Scrubbing dispatch. 
*/ + +static const struct xchk_meta_ops meta_scrub_ops[] = { + [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */ + .type = ST_NONE, + .setup = xchk_setup_fs, + .scrub = xchk_probe, + .repair = xrep_probe, + }, + [XFS_SCRUB_TYPE_SB] = { /* superblock */ + .type = ST_PERAG, + .setup = xchk_setup_agheader, + .scrub = xchk_superblock, + .repair = xrep_superblock, + }, + [XFS_SCRUB_TYPE_AGF] = { /* agf */ + .type = ST_PERAG, + .setup = xchk_setup_agheader, + .scrub = xchk_agf, + .repair = xrep_agf, + }, + [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ + .type = ST_PERAG, + .setup = xchk_setup_agheader, + .scrub = xchk_agfl, + .repair = xrep_agfl, + }, + [XFS_SCRUB_TYPE_AGI] = { /* agi */ + .type = ST_PERAG, + .setup = xchk_setup_agheader, + .scrub = xchk_agi, + .repair = xrep_agi, + }, + [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_allocbt, + .scrub = xchk_bnobt, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_allocbt, + .scrub = xchk_cntbt, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_iallocbt, + .scrub = xchk_inobt, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_iallocbt, + .scrub = xchk_finobt, + .has = xfs_has_finobt, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_rmapbt, + .scrub = xchk_rmapbt, + .has = xfs_has_rmapbt, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ + .type = ST_PERAG, + .setup = xchk_setup_ag_refcountbt, + .scrub = xchk_refcountbt, + .has = xfs_has_reflink, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_INODE] = { /* inode record */ + .type = ST_INODE, + .setup = xchk_setup_inode, + .scrub = xchk_inode, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ + .type = ST_INODE, + .setup = xchk_setup_inode_bmap, + .scrub = xchk_bmap_data, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ + .type = ST_INODE, + .setup = xchk_setup_inode_bmap, + .scrub = xchk_bmap_attr, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ + .type = ST_INODE, + .setup = xchk_setup_inode_bmap, + .scrub = xchk_bmap_cow, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_DIR] = { /* directory */ + .type = ST_INODE, + .setup = xchk_setup_directory, + .scrub = xchk_directory, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ + .type = ST_INODE, + .setup = xchk_setup_xattr, + .scrub = xchk_xattr, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ + .type = ST_INODE, + .setup = xchk_setup_symlink, + .scrub = xchk_symlink, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ + .type = ST_INODE, + .setup = xchk_setup_parent, + .scrub = xchk_parent, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ + .type = ST_FS, + .setup = xchk_setup_rtbitmap, + .scrub = xchk_rtbitmap, + .has = xfs_has_realtime, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ + .type = ST_FS, + .setup = xchk_setup_rtsummary, + .scrub = xchk_rtsummary, + .has = xfs_has_realtime, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ + .type = 
ST_FS, + .setup = xchk_setup_quota, + .scrub = xchk_quota, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ + .type = ST_FS, + .setup = xchk_setup_quota, + .scrub = xchk_quota, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ + .type = ST_FS, + .setup = xchk_setup_quota, + .scrub = xchk_quota, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ + .type = ST_FS, + .setup = xchk_setup_fscounters, + .scrub = xchk_fscounters, + .repair = xrep_notsupported, + }, +}; + +static int +xchk_validate_inputs( + struct xfs_mount *mp, + struct xfs_scrub_metadata *sm) +{ + int error; + const struct xchk_meta_ops *ops; + + error = -EINVAL; + /* Check our inputs. */ + sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) + goto out; + /* sm_reserved[] must be zero */ + if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) + goto out; + + error = -ENOENT; + /* Do we know about this type of metadata? */ + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) + goto out; + ops = &meta_scrub_ops[sm->sm_type]; + if (ops->setup == NULL || ops->scrub == NULL) + goto out; + /* Does this fs even support this type of metadata? */ + if (ops->has && !ops->has(mp)) + goto out; + + error = -EINVAL; + /* restricting fields must be appropriate for type */ + switch (ops->type) { + case ST_NONE: + case ST_FS: + if (sm->sm_ino || sm->sm_gen || sm->sm_agno) + goto out; + break; + case ST_PERAG: + if (sm->sm_ino || sm->sm_gen || + sm->sm_agno >= mp->m_sb.sb_agcount) + goto out; + break; + case ST_INODE: + if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) + goto out; + break; + default: + goto out; + } + + /* No rebuild without repair. */ + if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) && + !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return -EINVAL; + + /* + * We only want to repair read-write v5+ filesystems. Defer the check + * for ops->repair until after our scrub confirms that we need to + * perform repairs so that we avoid failing due to not supporting + * repairing an object that doesn't need repairs. + */ + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = -EOPNOTSUPP; + if (!xfs_has_crc(mp)) + goto out; + + error = -EROFS; + if (xfs_is_readonly(mp)) + goto out; + } + + error = 0; +out: + return error; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +static inline void xchk_postmortem(struct xfs_scrub *sc) +{ + /* + * Userspace asked us to repair something, we repaired it, rescanned + * it, and the rescan says it's still broken. Scream about this in + * the system logs. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + xrep_failure(sc->mp); +} +#else +static inline void xchk_postmortem(struct xfs_scrub *sc) +{ + /* + * Userspace asked us to scrub something, it's broken, and we have no + * way of fixing it. Scream in the logs. + */ + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(sc->mp, + "Corruption detected during scrub."); +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +/* Dispatch metadata scrubbing. 
*/ +int +xfs_scrub_metadata( + struct file *file, + struct xfs_scrub_metadata *sm) +{ + struct xchk_stats_run run = { }; + struct xfs_scrub *sc; + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; + u64 check_start; + int error = 0; + + BUILD_BUG_ON(sizeof(meta_scrub_ops) != + (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR)); + + trace_xchk_start(XFS_I(file_inode(file)), sm, error); + + /* Forbidden if we are shut down or mounted norecovery. */ + error = -ESHUTDOWN; + if (xfs_is_shutdown(mp)) + goto out; + error = -ENOTRECOVERABLE; + if (xfs_has_norecovery(mp)) + goto out; + + error = xchk_validate_inputs(mp, sm); + if (error) + goto out; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); + + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); + if (!sc) { + error = -ENOMEM; + goto out; + } + + sc->mp = mp; + sc->file = file; + sc->sm = sm; + sc->ops = &meta_scrub_ops[sm->sm_type]; + sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); +retry_op: + /* + * When repairs are allowed, prevent freezing or readonly remount while + * scrub is running with a real transaction. + */ + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = mnt_want_write_file(sc->file); + if (error) + goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; + } + + /* Set up for the operation. */ + error = sc->ops->setup(sc); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error) + goto out_teardown; + + /* Scrub for errors. */ + check_start = xchk_stats_now(); + error = sc->ops->scrub(sc); + run.scrub_ns += xchk_stats_elapsed_ns(check_start); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) + goto out_teardown; + + xchk_update_health(sc); + + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED)) { + bool needs_fix = xchk_needs_repair(sc->sm); + + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + needs_fix = true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + needs_fix = true; + + /* + * If userspace asked for a repair but it wasn't necessary, + * report that back to userspace. + */ + if (!needs_fix) { + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; + goto out_nofix; + } + + /* + * If it's broken, userspace wants us to fix it, and we haven't + * already tried to fix it, then attempt a repair. + */ + error = xrep_attempt(sc, &run); + if (error == -EAGAIN) { + /* + * Either the repair function succeeded or it couldn't + * get all the resources it needs; either way, we go + * back to the beginning and call the scrub function. 
+ */ + error = xchk_teardown(sc, 0); + if (error) { + xrep_failure(mp); + goto out_sc; + } + goto retry_op; + } + } + +out_nofix: + xchk_postmortem(sc); +out_teardown: + error = xchk_teardown(sc, error); +out_sc: + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); + kfree(sc); +out: + trace_xchk_done(XFS_I(file_inode(file)), sm, error); + if (error == -EFSCORRUPTED || error == -EFSBADCRC) { + sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + error = 0; + } + return error; +need_drain: + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_NEED_DRAIN; + run.retries++; + goto retry_op; +try_harder: + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down + * everything we hold, then set up again with preparation for + * worst-case scenarios. + */ + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; + run.retries++; + goto retry_op; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h new file mode 100644 index 0000000000..1ef9c6b484 --- /dev/null +++ b/fs/xfs/scrub/scrub.h @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_SCRUB_H__ +#define __XFS_SCRUB_SCRUB_H__ + +struct xfs_scrub; + +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + +/* Type info and names for the scrub types. */ +enum xchk_type { + ST_NONE = 1, /* disabled */ + ST_PERAG, /* per-AG metadata */ + ST_FS, /* per-FS metadata */ + ST_INODE, /* per-inode metadata */ +}; + +struct xchk_meta_ops { + /* Acquire whatever resources are needed for the operation. */ + int (*setup)(struct xfs_scrub *sc); + + /* Examine metadata for errors. */ + int (*scrub)(struct xfs_scrub *); + + /* Repair or optimize the metadata. */ + int (*repair)(struct xfs_scrub *); + + /* Decide if we even have this piece of metadata. */ + bool (*has)(struct xfs_mount *); + + /* type describing required/allowed inputs */ + enum xchk_type type; +}; + +/* Buffer pointers and btree cursors for an entire AG. */ +struct xchk_ag { + struct xfs_perag *pag; + + /* AG btree roots */ + struct xfs_buf *agf_bp; + struct xfs_buf *agi_bp; + + /* AG btrees */ + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur; + struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; +}; + +struct xfs_scrub { + /* General scrub state. */ + struct xfs_mount *mp; + struct xfs_scrub_metadata *sm; + const struct xchk_meta_ops *ops; + struct xfs_trans *tp; + + /* File that scrub was called with. */ + struct file *file; + + /* + * File that is undergoing the scrub operation. This can differ from + * the file that scrub was called with if we're checking file-based fs + * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for + * something that can't be opened directly (e.g. symlinks). + */ + struct xfs_inode *ip; + + /* Kernel memory buffer used by scrubbers; freed at teardown. */ + void *buf; + + /* + * Clean up resources owned by whatever is in the buffer. 
Cleanup can + * be deferred with this hook as a means for scrub functions to pass + * data to repair functions. This function must not free the buffer + * itself. + */ + void (*buf_cleanup)(void *buf); + + /* xfile used by the scrubbers; freed at teardown. */ + struct xfile *xfile; + + /* Lock flags for @ip. */ + uint ilock_flags; + + /* See the XCHK/XREP state flags below. */ + unsigned int flags; + + /* + * The XFS_SICK_* flags that correspond to the metadata being scrubbed + * or repaired. We will use this mask to update the in-core fs health + * status with whatever we find. + */ + unsigned int sick_mask; + + /* State tracking for single-AG operations. */ + struct xchk_ag sa; +}; + +/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */ +#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ +#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ + +/* + * The XCHK_FSGATES* flags reflect functionality in the main filesystem that + * is only enabled for this particular online fsck. When not in use, the + * features are gated off via dynamic code patching, which is why the state + * must be enabled during scrub setup and can only be torn down afterwards. + */ +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) + +/* Metadata scrubbers */ +int xchk_tester(struct xfs_scrub *sc); +int xchk_superblock(struct xfs_scrub *sc); +int xchk_agf(struct xfs_scrub *sc); +int xchk_agfl(struct xfs_scrub *sc); +int xchk_agi(struct xfs_scrub *sc); +int xchk_bnobt(struct xfs_scrub *sc); +int xchk_cntbt(struct xfs_scrub *sc); +int xchk_inobt(struct xfs_scrub *sc); +int xchk_finobt(struct xfs_scrub *sc); +int xchk_rmapbt(struct xfs_scrub *sc); +int xchk_refcountbt(struct xfs_scrub *sc); +int xchk_inode(struct xfs_scrub *sc); +int xchk_bmap_data(struct xfs_scrub *sc); +int xchk_bmap_attr(struct xfs_scrub *sc); +int xchk_bmap_cow(struct xfs_scrub *sc); +int xchk_directory(struct xfs_scrub *sc); +int xchk_xattr(struct xfs_scrub *sc); +int xchk_symlink(struct xfs_scrub *sc); +int xchk_parent(struct xfs_scrub *sc); +#ifdef CONFIG_XFS_RT +int xchk_rtbitmap(struct xfs_scrub *sc); +int xchk_rtsummary(struct xfs_scrub *sc); +#else +static inline int +xchk_rtbitmap(struct xfs_scrub *sc) +{ + return -ENOENT; +} +static inline int +xchk_rtsummary(struct xfs_scrub *sc) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xchk_quota(struct xfs_scrub *sc); +#else +static inline int +xchk_quota(struct xfs_scrub *sc) +{ + return -ENOENT; +} +#endif +int xchk_fscounters(struct xfs_scrub *sc); + +/* cross-referencing helpers */ +void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len); +void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len); +void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len); +void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno, + xfs_extlen_t len); +void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); 
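The declarations above are the kernel-internal face of online fsck; userspace reaches this machinery through the XFS_IOC_SCRUB_METADATA ioctl that xfs_scrub_metadata() dispatches earlier in scrub.c. For orientation, a minimal caller might look like the sketch below. This is an illustration, not part of the patch: the xfs/xfs.h include path is the header installed by xfsprogs, the AGF-of-AG-0 target is an arbitrary choice, and the error handling is a placeholder; only the ioctl, the xfs_scrub_metadata structure, and the XFS_SCRUB_TYPE_*/XFS_SCRUB_OFLAG_* constants come from the XFS uapi used by the code above.

/*
 * Illustrative sketch only.  Runs one online check of the AGF in AG 0 on
 * whatever XFS filesystem holds the given path.  Assumes the userspace
 * headers shipped by xfsprogs and CAP_SYS_ADMIN (typically root).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>		/* assumed: pulls in the xfs_fs.h uapi definitions */

int main(int argc, char **argv)
{
	struct xfs_scrub_metadata sm;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path-on-xfs-filesystem>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Check the AGF of AG 0.  Per the ST_PERAG rules enforced by
	 * xchk_validate_inputs(), sm_ino and sm_gen must be zero and
	 * sm_agno must be a valid AG number.
	 */
	memset(&sm, 0, sizeof(sm));
	sm.sm_type = XFS_SCRUB_TYPE_AGF;
	sm.sm_agno = 0;

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0) {
		perror("XFS_IOC_SCRUB_METADATA");
		close(fd);
		return 1;
	}

	/* The output flags are the same OFLAG bits set by the scrubbers. */
	if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		printf("AGF 0: corruption detected\n");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_PREEN)
		printf("AGF 0: healthy, but could be optimized\n");
	else
		printf("AGF 0: clean\n");

	close(fd);
	return 0;
}

Setting XFS_SCRUB_IFLAG_REPAIR in sm_flags would additionally ask the kernel to fix whatever the check finds, subject to the read-write and V5-format restrictions in xchk_validate_inputs().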
+void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); +void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); +#ifdef CONFIG_XFS_RT +void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, + xfs_extlen_t len); +#else +# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +#endif + +#endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c new file mode 100644 index 0000000000..cd91db4a55 --- /dev/null +++ b/fs/xfs/scrub/stats.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_sysfs.h" +#include "xfs_btree.h" +#include "xfs_super.h" +#include "scrub/scrub.h" +#include "scrub/stats.h" +#include "scrub/trace.h" + +struct xchk_scrub_stats { + /* all 32-bit counters here */ + + /* checking stats */ + uint32_t invocations; + uint32_t clean; + uint32_t corrupt; + uint32_t preen; + uint32_t xfail; + uint32_t xcorrupt; + uint32_t incomplete; + uint32_t warning; + uint32_t retries; + + /* repair stats */ + uint32_t repair_invocations; + uint32_t repair_success; + + /* all 64-bit items here */ + + /* runtimes */ + uint64_t checktime_us; + uint64_t repairtime_us; + + /* non-counter state must go at the end for clearall */ + spinlock_t css_lock; +}; + +struct xchk_stats { + struct dentry *cs_debugfs; + struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR]; +}; + + +static struct xchk_stats global_stats; + +static const char *name_map[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = "sb", + [XFS_SCRUB_TYPE_AGF] = "agf", + [XFS_SCRUB_TYPE_AGFL] = "agfl", + [XFS_SCRUB_TYPE_AGI] = "agi", + [XFS_SCRUB_TYPE_BNOBT] = "bnobt", + [XFS_SCRUB_TYPE_CNTBT] = "cntbt", + [XFS_SCRUB_TYPE_INOBT] = "inobt", + [XFS_SCRUB_TYPE_FINOBT] = "finobt", + [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt", + [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt", + [XFS_SCRUB_TYPE_INODE] = "inode", + [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd", + [XFS_SCRUB_TYPE_BMBTA] = "bmapbta", + [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc", + [XFS_SCRUB_TYPE_DIR] = "directory", + [XFS_SCRUB_TYPE_XATTR] = "xattr", + [XFS_SCRUB_TYPE_SYMLINK] = "symlink", + [XFS_SCRUB_TYPE_PARENT] = "parent", + [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap", + [XFS_SCRUB_TYPE_RTSUM] = "rtsummary", + [XFS_SCRUB_TYPE_UQUOTA] = "usrquota", + [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", + [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", + [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", +}; + +/* Format the scrub stats into a text buffer, similar to pcp style. 
*/ +STATIC ssize_t +xchk_stats_format( + struct xchk_stats *cs, + char *buf, + size_t remaining) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + ssize_t copied = 0; + int ret = 0; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + ret = scnprintf(buf, remaining, + "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n", + name_map[i], + (unsigned int)css->invocations, + (unsigned int)css->clean, + (unsigned int)css->corrupt, + (unsigned int)css->preen, + (unsigned int)css->xfail, + (unsigned int)css->xcorrupt, + (unsigned int)css->incomplete, + (unsigned int)css->warning, + (unsigned int)css->retries, + (unsigned long long)css->checktime_us, + (unsigned int)css->repair_invocations, + (unsigned int)css->repair_success, + (unsigned long long)css->repairtime_us); + if (ret <= 0) + break; + + remaining -= ret; + copied += ret; + buf += ret; + } + + return copied > 0 ? copied : ret; +} + +/* Estimate the worst case buffer size required to hold the whole report. */ +STATIC size_t +xchk_stats_estimate_bufsize( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + size_t field_width; + size_t ret = 0; + + /* 4294967296 plus one space for each u32 field */ + field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) / + sizeof(uint32_t)); + + /* 18446744073709551615 plus one space for each u64 field */ + field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) - + offsetof(struct xchk_scrub_stats, checktime_us)) / + sizeof(uint64_t)); + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + /* name plus one space */ + ret += 1 + strlen(name_map[i]); + + /* all fields, plus newline */ + ret += field_width + 1; + } + + return ret; +} + +/* Clear all counters. */ +STATIC void +xchk_stats_clearall( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + spin_lock(&css->css_lock); + memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock)); + spin_unlock(&css->css_lock); + } +} + +#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) + +STATIC void +xchk_stats_merge_one( + struct xchk_stats *cs, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + struct xchk_scrub_stats *css; + + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) { + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + return; + } + + css = &cs->cs_stats[sm->sm_type]; + spin_lock(&css->css_lock); + css->invocations++; + if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN)) + css->clean++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + css->corrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + css->preen++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL) + css->xfail++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT) + css->xcorrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + css->incomplete++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING) + css->warning++; + css->retries += run->retries; + css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC); + + if (run->repair_attempted) + css->repair_invocations++; + if (run->repair_succeeded) + css->repair_success++; + css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC); + spin_unlock(&css->css_lock); +} + +/* Merge these scrub-run stats into the global and mount stat data. 
*/ +void +xchk_stats_merge( + struct xfs_mount *mp, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + xchk_stats_merge_one(&global_stats, sm, run); + xchk_stats_merge_one(mp->m_scrub_stats, sm, run); +} + +/* debugfs boilerplate */ + +static ssize_t +xchk_scrub_stats_read( + struct file *file, + char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + char *buf; + size_t bufsize; + ssize_t avail, ret; + + /* + * This generates stringly snapshot of all the scrub counters, so we + * do not want userspace to receive garbled text from multiple calls. + * If the file position is greater than 0, return a short read. + */ + if (*ppos > 0) + return 0; + + bufsize = xchk_stats_estimate_bufsize(cs); + + buf = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!buf) + return -ENOMEM; + + avail = xchk_stats_format(cs, buf, bufsize); + if (avail < 0) { + ret = avail; + goto out; + } + + ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail); +out: + kvfree(buf); + return ret; +} + +static const struct file_operations scrub_stats_fops = { + .open = simple_open, + .read = xchk_scrub_stats_read, +}; + +static ssize_t +xchk_clear_scrub_stats_write( + struct file *file, + const char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + unsigned int val; + int ret; + + ret = kstrtouint_from_user(ubuf, count, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + xchk_stats_clearall(cs); + return count; +} + +static const struct file_operations clear_scrub_stats_fops = { + .open = simple_open, + .write = xchk_clear_scrub_stats_write, +}; + +/* Initialize the stats object. */ +STATIC int +xchk_stats_init( + struct xchk_stats *cs, + struct xfs_mount *mp) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) + spin_lock_init(&css->css_lock); + + return 0; +} + +/* Connect the stats object to debugfs. */ +void +xchk_stats_register( + struct xchk_stats *cs, + struct dentry *parent) +{ + if (!parent) + return; + + cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent); + if (!cs->cs_debugfs) + return; + + debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + &scrub_stats_fops); + debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + &clear_scrub_stats_fops); +} + +/* Free all resources related to the stats object. */ +STATIC int +xchk_stats_teardown( + struct xchk_stats *cs) +{ + return 0; +} + +/* Disconnect the stats object from debugfs. 
*/ +void +xchk_stats_unregister( + struct xchk_stats *cs) +{ + debugfs_remove(cs->cs_debugfs); +} + +/* Initialize global stats and register them */ +int __init +xchk_global_stats_setup( + struct dentry *parent) +{ + int error; + + error = xchk_stats_init(&global_stats, NULL); + if (error) + return error; + + xchk_stats_register(&global_stats, parent); + return 0; +} + +/* Unregister global stats and tear them down */ +void +xchk_global_stats_teardown(void) +{ + xchk_stats_unregister(&global_stats); + xchk_stats_teardown(&global_stats); +} + +/* Allocate per-mount stats */ +int +xchk_mount_stats_alloc( + struct xfs_mount *mp) +{ + struct xchk_stats *cs; + int error; + + cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + error = xchk_stats_init(cs, mp); + if (error) + goto out_free; + + mp->m_scrub_stats = cs; + return 0; +out_free: + kvfree(cs); + return error; +} + +/* Free per-mount stats */ +void +xchk_mount_stats_free( + struct xfs_mount *mp) +{ + xchk_stats_teardown(mp->m_scrub_stats); + kvfree(mp->m_scrub_stats); + mp->m_scrub_stats = NULL; +} diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h new file mode 100644 index 0000000000..b358ad8d8b --- /dev/null +++ b/fs/xfs/scrub/stats.h @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_STATS_H__ +#define __XFS_SCRUB_STATS_H__ + +struct xchk_stats_run { + u64 scrub_ns; + u64 repair_ns; + unsigned int retries; + bool repair_attempted; + bool repair_succeeded; +}; + +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS +struct xchk_stats; + +int __init xchk_global_stats_setup(struct dentry *parent); +void xchk_global_stats_teardown(void); + +int xchk_mount_stats_alloc(struct xfs_mount *mp); +void xchk_mount_stats_free(struct xfs_mount *mp); + +void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent); +void xchk_stats_unregister(struct xchk_stats *cs); + +void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run); + +static inline u64 xchk_stats_now(void) { return ktime_get_ns(); } +static inline u64 xchk_stats_elapsed_ns(u64 since) +{ + u64 now = xchk_stats_now(); + + /* + * If the system doesn't have a high enough resolution clock, charge at + * least one nanosecond so that our stats don't report instantaneous + * runtimes. + */ + if (now == since) + return 1; + + return now - since; +} +#else +# define xchk_global_stats_setup(parent) (0) +# define xchk_global_stats_teardown() ((void)0) +# define xchk_mount_stats_alloc(mp) (0) +# define xchk_mount_stats_free(mp) ((void)0) +# define xchk_stats_register(cs, parent) ((void)0) +# define xchk_stats_unregister(cs) ((void)0) +# define xchk_stats_now() (0) +# define xchk_stats_elapsed_ns(x) (0 * (x)) +# define xchk_stats_merge(mp, sm, run) ((void)0) +#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */ + +#endif /* __XFS_SCRUB_STATS_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c new file mode 100644 index 0000000000..38708fb9a5 --- /dev/null +++ b/fs/xfs/scrub/symlink.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_inode.h" +#include "xfs_symlink.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +/* Set us up to scrub a symbolic link. */ +int +xchk_setup_symlink( + struct xfs_scrub *sc) +{ + /* Allocate the buffer without the inode lock held. */ + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_inode_contents(sc, 0); +} + +/* Symbolic links. */ + +int +xchk_symlink( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp; + loff_t len; + int error = 0; + + if (!S_ISLNK(VFS_I(ip)->i_mode)) + return -ENOENT; + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + len = ip->i_disk_size; + + /* Plausible size? */ + if (len > XFS_SYMLINK_MAXLEN || len <= 0) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Inline symlink? */ + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { + if (len > xfs_inode_data_fork_size(ip) || + len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Remote symlink; must read the contents. */ + error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); +out: + return error; +} diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c new file mode 100644 index 0000000000..46249e7b17 --- /dev/null +++ b/fs/xfs/scrub/trace.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" + +/* Figure out which block the btree cursor was pointing to. */ +static inline xfs_fsblock_t +xchk_btree_cur_fsbno( + struct xfs_btree_cur *cur, + int level) +{ + if (level < cur->bc_nlevels && cur->bc_levels[level].bp) + return XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(cur->bc_levels[level].bp)); + + if (level == cur->bc_nlevels - 1 && + (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); + + return NULLFSBLOCK; +} + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "scrub/trace.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h new file mode 100644 index 0000000000..cbd4d01e25 --- /dev/null +++ b/fs/xfs/scrub/trace.h @@ -0,0 +1,1341 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. See xfs_trace.h for documentation of + * specific units found in tracepoint output. 
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs_scrub + +#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_SCRUB_TRACE_H + +#include <linux/tracepoint.h> +#include "xfs_bit.h" + +struct xfile; +struct xfarray; +struct xfarray_sortinfo; + +/* + * ftrace's __print_symbolic requires that all enum values be wrapped in the + * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace + * ring buffer. Somehow this was only worth mentioning in the ftrace sample + * code. + */ +TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); +TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); + +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); + +#define XFS_SCRUB_TYPE_STRINGS \ + { XFS_SCRUB_TYPE_PROBE, "probe" }, \ + { XFS_SCRUB_TYPE_SB, "sb" }, \ + { XFS_SCRUB_TYPE_AGF, "agf" }, \ + { XFS_SCRUB_TYPE_AGFL, "agfl" }, \ + { XFS_SCRUB_TYPE_AGI, "agi" }, \ + { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \ + { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \ + { XFS_SCRUB_TYPE_INOBT, "inobt" }, \ + { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \ + { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \ + { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \ + { XFS_SCRUB_TYPE_INODE, "inode" }, \ + { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \ + { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \ + { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \ + { XFS_SCRUB_TYPE_DIR, "directory" }, \ + { XFS_SCRUB_TYPE_XATTR, "xattr" }, \ + { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \ + { XFS_SCRUB_TYPE_PARENT, "parent" }, \ + { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \ + { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ + { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ + { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } + +#define XFS_SCRUB_FLAG_STRINGS \ + { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ + { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \ + { XFS_SCRUB_OFLAG_PREEN, "preen" }, \ + { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \ + { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ + { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ + { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \ + { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" } + +#define XFS_SCRUB_STATE_STRINGS \ + { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, 
\ + { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ + { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_ALREADY_FIXED, "already_fixed" } + +DECLARE_EVENT_CLASS(xchk_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, + int error), + TP_ARGS(ip, sm, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->type = sm->sm_type; + __entry->agno = sm->sm_agno; + __entry->inum = sm->sm_ino; + __entry->gen = sm->sm_gen; + __entry->flags = sm->sm_flags; + __entry->error = error; + ), + TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->agno, + __entry->inum, + __entry->gen, + __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS), + __entry->error) +) +#define DEFINE_SCRUB_EVENT(name) \ +DEFINE_EVENT(xchk_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \ + int error), \ + TP_ARGS(ip, sm, error)) + +DEFINE_SCRUB_EVENT(xchk_start); +DEFINE_SCRUB_EVENT(xchk_done); +DEFINE_SCRUB_EVENT(xchk_deadlock_retry); +DEFINE_SCRUB_EVENT(xrep_attempt); +DEFINE_SCRUB_EVENT(xrep_done); + +DECLARE_EVENT_CLASS(xchk_fsgate_class, + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags), + TP_ARGS(sc, fsgate_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, fsgate_flags) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->fsgate_flags = fsgate_flags; + ), + TP_printk("dev %d:%d type %s fsgates '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS)) +) + +#define DEFINE_SCRUB_FSHOOK_EVENT(name) \ +DEFINE_EVENT(xchk_fsgate_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \ + TP_ARGS(sc, fsgates_flags)) + +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); + +TRACE_EVENT(xchk_op_error, + TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int error, void *ret_ip), + TP_ARGS(sc, agno, bno, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xchk_file_op_error, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t offset, int error, void *ret_ip), + TP_ARGS(sc, whichfork, offset, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(int, error) + 
__field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->offset, + __entry->error, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xchk_block_error_class, + TP_PROTO(struct xfs_scrub *sc, xfs_daddr_t daddr, void *ret_ip), + TP_ARGS(sc, daddr, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = xfs_daddr_to_agno(sc->mp, daddr); + __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr); + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->agno, + __entry->agbno, + __entry->ret_ip) +) + +#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xchk_block_error_class, name, \ + TP_PROTO(struct xfs_scrub *sc, xfs_daddr_t daddr, \ + void *ret_ip), \ + TP_ARGS(sc, daddr, ret_ip)) + +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error); +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error); +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen); + +DECLARE_EVENT_CLASS(xchk_ino_error_class, + TP_PROTO(struct xfs_scrub *sc, xfs_ino_t ino, void *ret_ip), + TP_ARGS(sc, ino, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ino; + __entry->type = sc->sm->sm_type; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->ret_ip) +) + +#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \ +DEFINE_EVENT(xchk_ino_error_class, name, \ + TP_PROTO(struct xfs_scrub *sc, xfs_ino_t ino, \ + void *ret_ip), \ + TP_ARGS(sc, ino, ret_ip)) + +DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_error); +DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_preen); +DEFINE_SCRUB_INO_ERROR_EVENT(xchk_ino_warning); + +DECLARE_EVENT_CLASS(xchk_fblock_error_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t offset, void *ret_ip), + TP_ARGS(sc, whichfork, offset, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->offset, + 
__entry->ret_ip) +); + +#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xchk_fblock_error_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + xfs_fileoff_t offset, void *ret_ip), \ + TP_ARGS(sc, whichfork, offset, ret_ip)) + +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); + +TRACE_EVENT(xchk_incomplete, + TP_PROTO(struct xfs_scrub *sc, void *ret_ip), + TP_ARGS(sc, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->ret_ip) +); + +TRACE_EVENT(xchk_btree_op_error, + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_levels[level].ptr; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xchk_ifork_btree_op_error, + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(int, ptr) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_ino.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->ptr = cur->bc_levels[level].ptr; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + 
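Every event defined in this header is registered under the xfs_scrub trace system declared by TRACE_SYSTEM above, so the formatted TP_printk output can be read back through tracefs while a scrub runs. The sketch below shows one way to do that from C; it is illustrative only and assumes tracefs is mounted at /sys/kernel/tracing (older setups expose the same files under /sys/kernel/debug/tracing) and that the caller has root privileges.

/* Illustrative sketch only: enable the xfs_scrub events and stream them. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Enable every event in the xfs_scrub trace system. */
	fd = open("/sys/kernel/tracing/events/xfs_scrub/enable", O_WRONLY);
	if (fd < 0) {
		perror("enable xfs_scrub events");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);

	/* Stream the text produced by the TP_printk formats in this header. */
	fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
	if (fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}

Writing "0" to the same enable file turns the event group back off.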
+TRACE_EVENT(xchk_btree_error, + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_levels[level].ptr; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +TRACE_EVENT(xchk_ifork_btree_error, + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_ino.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_levels[level].ptr; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xchk_sbtree_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level), + TP_ARGS(sc, cur, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, level) + __field(int, nlevels) + __field(int, ptr) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->ptr = cur->bc_levels[level].ptr; + ), + TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->bno, + 
__entry->level, + __entry->nlevels, + __entry->ptr) +) +#define DEFINE_SCRUB_SBTREE_EVENT(name) \ +DEFINE_EVENT(xchk_sbtree_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_btree_cur *cur, \ + int level), \ + TP_ARGS(sc, cur, level)) + +DEFINE_SCRUB_SBTREE_EVENT(xchk_btree_rec); +DEFINE_SCRUB_SBTREE_EVENT(xchk_btree_key); + +TRACE_EVENT(xchk_xref_error, + TP_PROTO(struct xfs_scrub *sc, int error, void *ret_ip), + TP_ARGS(sc, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %s xref error %d ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + +TRACE_EVENT(xchk_inode_is_allocated, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long, iflags) + __field(umode_t, mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; + __entry->mode = VFS_I(ip)->i_mode; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->mode) +); + +TRACE_EVENT(xchk_fscounters_calc, + TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, + uint64_t fdblocks, uint64_t delalloc), + TP_ARGS(mp, icount, ifree, fdblocks, delalloc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int64_t, icount_sb) + __field(uint64_t, icount_calculated) + __field(int64_t, ifree_sb) + __field(uint64_t, ifree_calculated) + __field(int64_t, fdblocks_sb) + __field(uint64_t, fdblocks_calculated) + __field(uint64_t, delalloc) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->icount_sb = mp->m_sb.sb_icount; + 
__entry->icount_calculated = icount; + __entry->ifree_sb = mp->m_sb.sb_ifree; + __entry->ifree_calculated = ifree; + __entry->fdblocks_sb = mp->m_sb.sb_fdblocks; + __entry->fdblocks_calculated = fdblocks; + __entry->delalloc = delalloc; + ), + TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount_sb, + __entry->icount_calculated, + __entry->ifree_sb, + __entry->ifree_calculated, + __entry->fdblocks_sb, + __entry->fdblocks_calculated, + __entry->delalloc) +) + +TRACE_EVENT(xchk_fscounters_within_range, + TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value, + int64_t old_value), + TP_ARGS(mp, expected, curr_value, old_value), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint64_t, expected) + __field(int64_t, curr_value) + __field(int64_t, old_value) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->expected = expected; + __entry->curr_value = curr_value; + __entry->old_value = old_value; + ), + TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->expected, + __entry->curr_value, + __entry->old_value) +) + +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + +TRACE_EVENT(xchk_refcount_incorrect, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + xfs_nlink_t seen), + TP_ARGS(pag, irec, seen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_nlink_t, seen) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = irec->rc_domain; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->seen = seen; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->seen) +) + +TRACE_EVENT(xfile_create, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + + __entry->ino = file_inode(xf->file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(xf->file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xfino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) 
+); + +TRACE_EVENT(xfile_destroy, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); + +DECLARE_EVENT_CLASS(xfile_class, + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), + TP_ARGS(xf, pos, bytecount), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes_used) + __field(loff_t, pos) + __field(loff_t, size) + __field(unsigned long long, bytecount) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes_used = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes_used = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + __entry->pos = pos; + __entry->bytecount = bytecount; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes_used, + __entry->pos, + __entry->bytecount, + __entry->size) +); +#define DEFINE_XFILE_EVENT(name) \ +DEFINE_EVENT(xfile_class, name, \ + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ + TP_ARGS(xf, pos, bytecount)) +DEFINE_XFILE_EVENT(xfile_pread); +DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_seek_data); +DEFINE_XFILE_EVENT(xfile_get_page); +DEFINE_XFILE_EVENT(xfile_put_page); + +TRACE_EVENT(xfarray_create, + TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), + TP_ARGS(xfa, required_capacity), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(uint64_t, max_nr) + __field(size_t, obj_size) + __field(int, obj_size_log) + __field(unsigned long long, required_capacity) + ), + TP_fast_assign( + __entry->max_nr = xfa->max_nr; + __entry->obj_size = xfa->obj_size; + __entry->obj_size_log = xfa->obj_size_log; + __entry->ino = file_inode(xfa->xfile->file)->i_ino; + __entry->required_capacity = required_capacity; + ), + TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d", + __entry->ino, + __entry->max_nr, + __entry->required_capacity, + __entry->obj_size, + __entry->obj_size_log) +); + +TRACE_EVENT(xfarray_isort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_pagesort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - 
__entry->lo) +); + +TRACE_EVENT(xfarray_qsort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + __field(int, stack_depth) + __field(int, max_stack_depth) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + __entry->stack_depth = si->stack_depth; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo, + __entry->stack_depth, + __entry->max_stack_depth) +); + +TRACE_EVENT(xfarray_sort, + TP_PROTO(struct xfarray_sortinfo *si, size_t bytes), + TP_ARGS(si, bytes), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(size_t, bytes) + __field(unsigned int, max_stack_depth) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->bytes = bytes; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->max_stack_depth, + __entry->bytes) +); + +TRACE_EVENT(xfarray_sort_stats, + TP_PROTO(struct xfarray_sortinfo *si, int error), + TP_ARGS(si, error), + TP_STRUCT__entry( + __field(unsigned long, ino) +#ifdef DEBUG + __field(unsigned long long, loads) + __field(unsigned long long, stores) + __field(unsigned long long, compares) + __field(unsigned long long, heapsorts) +#endif + __field(unsigned int, max_stack_depth) + __field(unsigned int, max_stack_used) + __field(int, error) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; +#ifdef DEBUG + __entry->loads = si->loads; + __entry->stores = si->stores; + __entry->compares = si->compares; + __entry->heapsorts = si->heapsorts; +#endif + __entry->max_stack_depth = si->max_stack_depth; + __entry->max_stack_used = si->max_stack_used; + __entry->error = error; + ), + TP_printk( +#ifdef DEBUG + "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d", +#else + "xfino 0x%lx stack_depth %u/%u error %d", +#endif + __entry->ino, +#ifdef DEBUG + __entry->loads, + __entry->stores, + __entry->compares, + __entry->heapsorts, +#endif + __entry->max_stack_used, + __entry->max_stack_depth, + __entry->error) +); + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xchk_rtsum_record_free, + TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, + uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v), + TP_ARGS(mp, start, len, log, pos, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtblock_t, start) + __field(unsigned long long, len) + __field(unsigned int, log) + __field(loff_t, pos) + __field(xfs_suminfo_t, v) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->len = len; + __entry->log = log; + __entry->pos = pos; + __entry->v = v; + ), + TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->len, + __entry->log, + __entry->pos, + __entry->v) +); +#endif /* 
CONFIG_XFS_RT */ + +/* repair tracepoints */ +#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) + +DECLARE_EVENT_CLASS(xrep_extent_class, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(pag, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REPAIR_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_extent_class, name, \ + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(pag, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); +DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); + +DECLARE_EVENT_CLASS(xrep_reap_find_class, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, + bool crosslinked), + TP_ARGS(pag, agbno, len, crosslinked), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(bool, crosslinked) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->crosslinked = crosslinked; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->crosslinked ? 
1 : 0) +); +#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ +DEFINE_EVENT(xrep_reap_find_class, name, \ + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ + bool crosslinked), \ + TP_ARGS(pag, agbno, len, crosslinked)) +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); + +DECLARE_EVENT_CLASS(xrep_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_REPAIR_RMAP_EVENT(name) \ +DEFINE_EVENT(xrep_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) +DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); + +TRACE_EVENT(xrep_refcount_extent_fn, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +TRACE_EVENT(xrep_findroot_block, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + uint32_t magic, uint16_t level), + TP_ARGS(mp, agno, agbno, magic, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, magic) + __field(uint16_t, level) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->magic = magic; + __entry->level = level; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->magic, + __entry->level) +) +TRACE_EVENT(xrep_calc_ag_resblks, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + xfs_agblock_t usedlen), + TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, icount) + __field(xfs_agblock_t, aglen) + 
__field(xfs_agblock_t, freelen) + __field(xfs_agblock_t, usedlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->icount = icount; + __entry->aglen = aglen; + __entry->freelen = freelen; + __entry->usedlen = usedlen; + ), + TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->icount, + __entry->aglen, + __entry->freelen, + __entry->usedlen) +) +TRACE_EVENT(xrep_calc_ag_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, + xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), + TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bnobt_sz) + __field(xfs_agblock_t, inobt_sz) + __field(xfs_agblock_t, rmapbt_sz) + __field(xfs_agblock_t, refcbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bnobt_sz = bnobt_sz; + __entry->inobt_sz = inobt_sz; + __entry->rmapbt_sz = rmapbt_sz; + __entry->refcbt_sz = refcbt_sz; + ), + TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bnobt_sz, + __entry->inobt_sz, + __entry->rmapbt_sz, + __entry->refcbt_sz) +) +TRACE_EVENT(xrep_reset_counters, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + ), + TP_printk("dev %d:%d", + MAJOR(__entry->dev), MINOR(__entry->dev)) +) + +TRACE_EVENT(xrep_ialloc_insert, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, uint16_t holemask, uint8_t count, + uint8_t freecount, uint64_t freemask), + TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->holemask = holemask; + __entry->count = count; + __entry->freecount = freecount; + __entry->freemask = freemask; + ), + TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ + +#endif /* _TRACE_XFS_SCRUB_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE scrub/trace +#include <trace/define_trace.h> diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c new file mode 100644 index 0000000000..f0f532c10a --- /dev/null +++ b/fs/xfs/scrub/xfarray.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" + +/* + * Large Arrays of Fixed-Size Records + * ================================== + * + * This memory array uses an xfile (which itself is a memfd "file") to store + * large numbers of fixed-size records in memory that can be paged out. This + * puts less stress on the memory reclaim algorithms during an online repair + * because we don't have to pin so much memory. However, array access is less + * direct than would be in a regular memory array. Access to the array is + * performed via indexed load and store methods, and an append method is + * provided for convenience. Array elements can be unset, which sets them to + * all zeroes. Unset entries are skipped during iteration, though direct loads + * will return a zeroed buffer. Callers are responsible for concurrency + * control. + */ + +/* + * Pointer to scratch space. Because we can't access the xfile data directly, + * we allocate a small amount of memory on the end of the xfarray structure to + * buffer array items when we need space to store values temporarily. + */ +static inline void *xfarray_scratch(struct xfarray *array) +{ + return (array + 1); +} + +/* Compute array index given an xfile offset. */ +static xfarray_idx_t +xfarray_idx( + struct xfarray *array, + loff_t pos) +{ + if (array->obj_size_log >= 0) + return (xfarray_idx_t)pos >> array->obj_size_log; + + return div_u64((xfarray_idx_t)pos, array->obj_size); +} + +/* Compute xfile offset of array element. */ +static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx) +{ + if (array->obj_size_log >= 0) + return idx << array->obj_size_log; + + return idx * array->obj_size; +} + +/* + * Initialize a big memory array. Array records cannot be larger than a + * page, and the array cannot span more bytes than the page cache supports. + * If @required_capacity is nonzero, the maximum array size will be set to this + * quantity and the array creation will fail if the underlying storage cannot + * support that many records. + */ +int +xfarray_create( + const char *description, + unsigned long long required_capacity, + size_t obj_size, + struct xfarray **arrayp) +{ + struct xfarray *array; + struct xfile *xfile; + int error; + + ASSERT(obj_size < PAGE_SIZE); + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + error = -ENOMEM; + array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS); + if (!array) + goto out_xfile; + + array->xfile = xfile; + array->obj_size = obj_size; + + if (is_power_of_2(obj_size)) + array->obj_size_log = ilog2(obj_size); + else + array->obj_size_log = -1; + + array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE); + trace_xfarray_create(array, required_capacity); + + if (required_capacity > 0) { + if (array->max_nr < required_capacity) { + error = -ENOMEM; + goto out_xfarray; + } + array->max_nr = required_capacity; + } + + *arrayp = array; + return 0; + +out_xfarray: + kfree(array); +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy the array. */ +void +xfarray_destroy( + struct xfarray *array) +{ + xfile_destroy(array->xfile); + kfree(array); +} + +/* Load an element from the array. 
*/ +int +xfarray_load( + struct xfarray *array, + xfarray_idx_t idx, + void *ptr) +{ + if (idx >= array->nr) + return -ENODATA; + + return xfile_obj_load(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); +} + +/* Is this array element potentially unset? */ +static inline bool +xfarray_is_unset( + struct xfarray *array, + loff_t pos) +{ + void *temp = xfarray_scratch(array); + int error; + + if (array->unset_slots == 0) + return false; + + error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + if (!error && xfarray_element_is_null(array, temp)) + return true; + + return false; +} + +/* + * Unset an array element. If @idx is the last element in the array, the + * array will be truncated. Otherwise, the entry will be zeroed. + */ +int +xfarray_unset( + struct xfarray *array, + xfarray_idx_t idx) +{ + void *temp = xfarray_scratch(array); + loff_t pos = xfarray_pos(array, idx); + int error; + + if (idx >= array->nr) + return -ENODATA; + + if (idx == array->nr - 1) { + array->nr--; + return 0; + } + + if (xfarray_is_unset(array, pos)) + return 0; + + memset(temp, 0, array->obj_size); + error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + if (error) + return error; + + array->unset_slots++; + return 0; +} + +/* + * Store an element in the array. The element must not be completely zeroed, + * because those are considered unset sparse elements. + */ +int +xfarray_store( + struct xfarray *array, + xfarray_idx_t idx, + const void *ptr) +{ + int ret; + + if (idx >= array->max_nr) + return -EFBIG; + + ASSERT(!xfarray_element_is_null(array, ptr)); + + ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); + if (ret) + return ret; + + array->nr = max(array->nr, idx + 1); + return 0; +} + +/* Is this array element NULL? */ +bool +xfarray_element_is_null( + struct xfarray *array, + const void *ptr) +{ + return !memchr_inv(ptr, 0, array->obj_size); +} + +/* + * Store an element anywhere in the array that is unset. If there are no + * unset slots, append the element to the array. + */ +int +xfarray_store_anywhere( + struct xfarray *array, + const void *ptr) +{ + void *temp = xfarray_scratch(array); + loff_t endpos = xfarray_pos(array, array->nr); + loff_t pos; + int error; + + /* Find an unset slot to put it in. */ + for (pos = 0; + pos < endpos && array->unset_slots > 0; + pos += array->obj_size) { + error = xfile_obj_load(array->xfile, temp, array->obj_size, + pos); + if (error || !xfarray_element_is_null(array, temp)) + continue; + + error = xfile_obj_store(array->xfile, ptr, array->obj_size, + pos); + if (error) + return error; + + array->unset_slots--; + return 0; + } + + /* No unset slots found; attach it on the end. */ + array->unset_slots = 0; + return xfarray_append(array, ptr); +} + +/* Return length of array. */ +uint64_t +xfarray_length( + struct xfarray *array) +{ + return array->nr; +} + +/* + * Decide which array item we're going to read as part of an _iter_get. + * @cur is the array index, and @pos is the file offset of that array index in + * the backing xfile. Returns ENODATA if we reach the end of the records. + * + * Reading from a hole in a sparse xfile causes page instantiation, so for + * iterating a (possibly sparse) array we need to figure out if the cursor is + * pointing at a totally uninitialized hole and move the cursor up if + * necessary. 
+ */ +static inline int +xfarray_find_data( + struct xfarray *array, + xfarray_idx_t *cur, + loff_t *pos) +{ + unsigned int pgoff = offset_in_page(*pos); + loff_t end_pos = *pos + array->obj_size - 1; + loff_t new_pos; + + /* + * If the current array record is not adjacent to a page boundary, we + * are in the middle of the page. We do not need to move the cursor. + */ + if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE) + return 0; + + /* + * Call SEEK_DATA on the last byte in the record we're about to read. + * If the record ends at (or crosses) the end of a page then we know + * that the first byte of the record is backed by pages and don't need + * to query it. If instead the record begins at the start of the page + * then we know that querying the last byte is just as good as querying + * the first byte, since records cannot be larger than a page. + * + * If the call returns the same file offset, we know this record is + * backed by real pages. We do not need to move the cursor. + */ + new_pos = xfile_seek_data(array->xfile, end_pos); + if (new_pos == -ENXIO) + return -ENODATA; + if (new_pos < 0) + return new_pos; + if (new_pos == end_pos) + return 0; + + /* + * Otherwise, SEEK_DATA told us how far up to move the file pointer to + * find more data. Move the array index to the first record past the + * byte offset we were given. + */ + new_pos = roundup_64(new_pos, array->obj_size); + *cur = xfarray_idx(array, new_pos); + *pos = xfarray_pos(array, *cur); + return 0; +} + +/* + * Starting at *idx, fetch the next non-null array entry and advance the index + * to set up the next _load_next call. Returns ENODATA if we reach the end of + * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first + * call to this function. + */ +int +xfarray_load_next( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + xfarray_idx_t cur = *idx; + loff_t pos = xfarray_pos(array, cur); + int error; + + do { + if (cur >= array->nr) + return -ENODATA; + + /* + * Ask the backing store for the location of next possible + * written record, then retrieve that record. + */ + error = xfarray_find_data(array, &cur, &pos); + if (error) + return error; + error = xfarray_load(array, cur, rec); + if (error) + return error; + + cur++; + pos += array->obj_size; + } while (xfarray_element_is_null(array, rec)); + + *idx = cur; + return 0; +} + +/* Sorting functions */ + +#ifdef DEBUG +# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0) +# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0) +# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0) +# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0) +#else +# define xfarray_sort_bump_loads(si) +# define xfarray_sort_bump_stores(si) +# define xfarray_sort_bump_compares(si) +# define xfarray_sort_bump_heapsorts(si) +#endif /* DEBUG */ + +/* Load an array element for sorting. */ +static inline int +xfarray_sort_load( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_loads(si); + return xfarray_load(si->array, idx, ptr); +} + +/* Store an array element for sorting. */ +static inline int +xfarray_sort_store( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_stores(si); + return xfarray_store(si->array, idx, ptr); +} + +/* Compare an array element for sorting. 
*/ +static inline int +xfarray_sort_cmp( + struct xfarray_sortinfo *si, + const void *a, + const void *b) +{ + xfarray_sort_bump_compares(si); + return si->cmp_fn(a, b); +} + +/* Return a pointer to the low index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si) +{ + return (xfarray_idx_t *)(si + 1); +} + +/* Return a pointer to the high index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_lo(si) + si->max_stack_depth; +} + +/* Size of each element in the quicksort pivot array. */ +static inline size_t +xfarray_pivot_rec_sz( + struct xfarray *array) +{ + return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t); +} + +/* Allocate memory to handle the sort. */ +static inline int +xfarray_sortinfo_alloc( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags, + struct xfarray_sortinfo **infop) +{ + struct xfarray_sortinfo *si; + size_t nr_bytes = sizeof(struct xfarray_sortinfo); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(array); + int max_stack_depth; + + /* + * The median-of-nine pivot algorithm doesn't work if a subset has + * fewer than 9 items. Make sure the in-memory sort will always take + * over for subsets where this wouldn't be the case. + */ + BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR); + + /* + * Tail-call recursion during the partitioning phase means that + * quicksort will never recurse more than log2(nr) times. We need one + * extra level of stack to hold the initial parameters. In-memory + * sort will always take care of the last few levels of recursion for + * us, so we can reduce the stack depth by that much. + */ + max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1); + if (max_stack_depth < 1) + max_stack_depth = 1; + + /* Each level of quicksort uses a lo and a hi index */ + nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2; + + /* Scratchpad for in-memory sort, or finding the pivot */ + nr_bytes += max_t(size_t, + (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz, + XFARRAY_ISORT_NR * array->obj_size); + + si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS); + if (!si) + return -ENOMEM; + + si->array = array; + si->cmp_fn = cmp_fn; + si->flags = flags; + si->max_stack_depth = max_stack_depth; + si->max_stack_used = 1; + + xfarray_sortinfo_lo(si)[0] = 0; + xfarray_sortinfo_hi(si)[0] = array->nr - 1; + + trace_xfarray_sort(si, nr_bytes); + *infop = si; + return 0; +} + +/* Should this sort be terminated by a fatal signal? */ +static inline bool +xfarray_sort_terminated( + struct xfarray_sortinfo *si, + int *error) +{ + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + + if ((si->flags & XFARRAY_SORT_KILLABLE) && + fatal_signal_pending(current)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +/* Do we want an in-memory sort? */ +static inline bool +xfarray_want_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t start, + xfarray_idx_t end) +{ + /* + * For array subsets that fit in the scratchpad, it's much faster to + * use the kernel's heapsort than quicksort's stack machine. + */ + return (end - start) < XFARRAY_ISORT_NR; +} + +/* Return the scratch space within the sortinfo structure. 
*/ +static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* + * Sort a small number of array records using scratchpad memory. The records + * need not be contiguous in the xfile's memory pages. + */ +STATIC int +xfarray_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *scratch = xfarray_sortinfo_isort_scratch(si); + loff_t lo_pos = xfarray_pos(si->array, lo); + loff_t len = xfarray_pos(si->array, hi - lo + 1); + int error; + + trace_xfarray_isort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); +} + +/* Grab a page for sorting records. */ +static inline int +xfarray_sort_get_page( + struct xfarray_sortinfo *si, + loff_t pos, + uint64_t len) +{ + int error; + + error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); + if (error) + return error; + + /* + * xfile pages must never be mapped into userspace, so we skip the + * dcache flush when mapping the page. + */ + si->page_kaddr = kmap_local_page(si->xfpage.page); + return 0; +} + +/* Release a page we grabbed for sorting records. */ +static inline int +xfarray_sort_put_page( + struct xfarray_sortinfo *si) +{ + if (!si->page_kaddr) + return 0; + + kunmap_local(si->page_kaddr); + si->page_kaddr = NULL; + + return xfile_put_page(si->array->xfile, &si->xfpage); +} + +/* Decide if these records are eligible for in-page sorting. */ +static inline bool +xfarray_want_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + pgoff_t lo_page; + pgoff_t hi_page; + loff_t end_pos; + + /* We can only map one page at a time. */ + lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; + end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; + hi_page = end_pos >> PAGE_SHIFT; + + return lo_page == hi_page; +} + +/* Sort a bunch of records that all live in the same memory page. */ +STATIC int +xfarray_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *startp; + loff_t lo_pos = xfarray_pos(si->array, lo); + uint64_t len = xfarray_pos(si->array, hi - lo); + int error = 0; + + trace_xfarray_pagesort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfarray_sort_get_page(si, lo_pos, len); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + startp = si->page_kaddr + offset_in_page(lo_pos); + sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfarray_sort_put_page(si); +} + +/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ +static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* Return a pointer to the start of the pivot array. */ +static inline void * +xfarray_sortinfo_pivot_array( + struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_pivot(si) + si->array->obj_size; +} + +/* The xfarray record is stored at the start of each pivot array element. */ +static inline void * +xfarray_pivot_array_rec( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return pa + (pa_recsz * pa_idx); +} + +/* The xfarray index is stored at the end of each pivot array element. 
*/ +static inline xfarray_idx_t * +xfarray_pivot_array_idx( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) - + sizeof(xfarray_idx_t); +} + +/* + * Find a pivot value for quicksort partitioning, swap it with a[lo], and save + * the cached pivot record for the next step. + * + * Load evenly-spaced records within the given range into memory, sort them, + * and choose the pivot from the median record. Using multiple points will + * improve the quality of the pivot selection, and hopefully avoid the worst + * quicksort behavior, since our array values are nearly always evenly sorted. + */ +STATIC int +xfarray_qsort_pivot( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *pivot = xfarray_sortinfo_pivot(si); + void *parray = xfarray_sortinfo_pivot_array(si); + void *recp; + xfarray_idx_t *idxp; + xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array); + int i, j; + int error; + + ASSERT(step > 0); + + /* + * Load the xfarray indexes of the records we intend to sample into the + * pivot array. + */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0); + *idxp = lo; + for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + *idxp = lo + (i * step); + } + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR - 1); + *idxp = hi; + + /* Load the selected xfarray records into the pivot array. */ + for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) { + xfarray_idx_t idx; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + + /* No unset records; load directly into the array. */ + if (likely(si->array->unset_slots == 0)) { + error = xfarray_sort_load(si, *idxp, recp); + if (error) + return error; + continue; + } + + /* + * Load non-null records into the scratchpad without changing + * the xfarray_idx_t in the pivot array. + */ + idx = *idxp; + xfarray_sort_bump_loads(si); + error = xfarray_load_next(si->array, &idx, recp); + if (error) + return error; + } + + xfarray_sort_bump_heapsorts(si); + sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL); + + /* + * We sorted the pivot array records (which includes the xfarray + * indices) in xfarray record order. The median element of the pivot + * array contains the xfarray record that we will use as the pivot. + * Copy that xfarray record to the designated space. + */ + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + memcpy(pivot, recp, si->array->obj_size); + + /* If the pivot record we chose was already in a[lo] then we're done. */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + if (*idxp == lo) + return 0; + + /* + * Find the cached copy of a[lo] in the pivot array so that we can swap + * a[lo] and a[pivot]. + */ + for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + if (*idxp == lo) + j = i; + } + if (j < 0) { + ASSERT(j >= 0); + return -EFSCORRUPTED; + } + + /* Swap a[lo] and a[pivot]. 
*/ + error = xfarray_sort_store(si, lo, pivot); + if (error) + return error; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + return xfarray_sort_store(si, *idxp, recp); +} + +/* + * Set up the pointers for the next iteration. We push onto the stack all of + * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the + * current stack frame to point to the unsorted values between a[beg[i]] and + * a[lo] so that those values will be sorted when we pop the stack. + */ +static inline int +xfarray_qsort_push( + struct xfarray_sortinfo *si, + xfarray_idx_t *si_lo, + xfarray_idx_t *si_hi, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + /* Check for stack overflows */ + if (si->stack_depth >= si->max_stack_depth - 1) { + ASSERT(si->stack_depth < si->max_stack_depth - 1); + return -EFSCORRUPTED; + } + + si->max_stack_used = max_t(uint8_t, si->max_stack_used, + si->stack_depth + 2); + + si_lo[si->stack_depth + 1] = lo + 1; + si_hi[si->stack_depth + 1] = si_hi[si->stack_depth]; + si_hi[si->stack_depth++] = lo - 1; + + /* + * Always start with the smaller of the two partitions to keep the + * amount of recursion in check. + */ + if (si_hi[si->stack_depth] - si_lo[si->stack_depth] > + si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) { + swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]); + swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]); + } + + return 0; +} + +/* + * Load an element from the array into the first scratchpad and cache the page, + * if possible. + */ +static inline int +xfarray_sort_load_cached( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + loff_t idx_pos = xfarray_pos(si->array, idx); + pgoff_t startpage; + pgoff_t endpage; + int error = 0; + + /* + * If this load would split a page, release the cached page, if any, + * and perform a traditional read. + */ + startpage = idx_pos >> PAGE_SHIFT; + endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; + if (startpage != endpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + + if (xfarray_sort_terminated(si, &error)) + return error; + + return xfile_obj_load(si->array->xfile, ptr, + si->array->obj_size, idx_pos); + } + + /* If the cached page is not the one we want, release it. */ + if (xfile_page_cached(&si->xfpage) && + xfile_page_index(&si->xfpage) != startpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + } + + /* + * If we don't have a cached page (and we know the load is contained + * in a single page) then grab it. + */ + if (!xfile_page_cached(&si->xfpage)) { + if (xfarray_sort_terminated(si, &error)) + return error; + + error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, + PAGE_SIZE); + if (error) + return error; + } + + memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), + si->array->obj_size); + return 0; +} + +/* + * Sort the array elements via quicksort. This implementation incorporates + * four optimizations discussed in Sedgewick: + * + * 1. Use an explicit stack of array indices to store the next array partition + * to sort. This helps us to avoid recursion in the call stack, which is + * particularly expensive in the kernel. + * + * 2. For arrays with records in arbitrary or user-controlled order, choose the + * pivot element using a median-of-nine decision tree. This reduces the + * probability of selecting a bad pivot value which causes worst case + * behavior (i.e. partition sizes of 1). + * + * 3. 
The smaller of the two sub-partitions is pushed onto the stack to start + * the next level of recursion, and the larger sub-partition replaces the + * current stack frame. This guarantees that we won't need more than + * log2(nr) stack space. + * + * 4. For small sets, load the records into the scratchpad and run heapsort on + * them because that is very fast. In the author's experience, this yields + * a ~10% reduction in runtime. + * + * If a small set is contained entirely within a single xfile memory page, + * map the page directly and run heap sort directly on the xfile page + * instead of using the load/store interface. This halves the runtime. + * + * 5. This optimization is specific to the implementation. When converging lo + * and hi after selecting a pivot, we will try to retain the xfile memory + * page between load calls, which reduces run time by 50%. + */ + +/* + * Due to the use of signed indices, we can only support up to 2^63 records. + * Files can only grow to 2^63 bytes, so this is not much of a limitation. + */ +#define QSORT_MAX_RECS (1ULL << 63) + +int +xfarray_sort( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags) +{ + struct xfarray_sortinfo *si; + xfarray_idx_t *si_lo, *si_hi; + void *pivot; + void *scratch = xfarray_scratch(array); + xfarray_idx_t lo, hi; + int error = 0; + + if (array->nr < 2) + return 0; + if (array->nr >= QSORT_MAX_RECS) + return -E2BIG; + + error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si); + if (error) + return error; + si_lo = xfarray_sortinfo_lo(si); + si_hi = xfarray_sortinfo_hi(si); + pivot = xfarray_sortinfo_pivot(si); + + while (si->stack_depth >= 0) { + lo = si_lo[si->stack_depth]; + hi = si_hi[si->stack_depth]; + + trace_xfarray_qsort(si, lo, hi); + + /* Nothing left in this partition to sort; pop stack. */ + if (lo >= hi) { + si->stack_depth--; + continue; + } + + /* + * If directly mapping the page and sorting can solve our + * problems, we're done. + */ + if (xfarray_want_pagesort(si, lo, hi)) { + error = xfarray_pagesort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* If insertion sort can solve our problems, we're done. */ + if (xfarray_want_isort(si, lo, hi)) { + error = xfarray_isort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* Pick a pivot, move it to a[lo] and stash it. */ + error = xfarray_qsort_pivot(si, lo, hi); + if (error) + goto out_free; + + /* + * Rearrange a[lo..hi] such that everything smaller than the + * pivot is on the left side of the range and everything larger + * than the pivot is on the right side of the range. + */ + while (lo < hi) { + /* + * Decrement hi until it finds an a[hi] less than the + * pivot value. + */ + error = xfarray_sort_load_cached(si, hi, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && + lo < hi) { + hi--; + error = xfarray_sort_load_cached(si, hi, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[hi]) to a[lo]. */ + if (lo < hi) { + error = xfarray_sort_store(si, lo++, scratch); + if (error) + goto out_free; + } + + /* + * Increment lo until it finds an a[lo] greater than + * the pivot value. 
+ */ + error = xfarray_sort_load_cached(si, lo, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && + lo < hi) { + lo++; + error = xfarray_sort_load_cached(si, lo, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[lo]) to a[hi]. */ + if (lo < hi) { + error = xfarray_sort_store(si, hi--, scratch); + if (error) + goto out_free; + } + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + + /* + * Put our pivot value in the correct place at a[lo]. All + * values between a[beg[i]] and a[lo - 1] should be less than + * the pivot; and all values between a[lo + 1] and a[end[i]-1] + * should be greater than the pivot. + */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + goto out_free; + + /* Set up the stack frame to process the two partitions. */ + error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + +out_free: + trace_xfarray_sort_stats(si, error); + kvfree(si); + return error; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h new file mode 100644 index 0000000000..4ecac01363 --- /dev/null +++ b/fs/xfs/scrub/xfarray.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFARRAY_H__ +#define __XFS_SCRUB_XFARRAY_H__ + +/* xfile array index type, along with cursor initialization */ +typedef uint64_t xfarray_idx_t; +#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) + +/* Iterate each index of an xfile array. */ +#define foreach_xfarray_idx(array, idx) \ + for ((idx) = XFARRAY_CURSOR_INIT; \ + (idx) < xfarray_length(array); \ + (idx)++) + +struct xfarray { + /* Underlying file that backs the array. */ + struct xfile *xfile; + + /* Number of array elements. */ + xfarray_idx_t nr; + + /* Maximum possible array size. */ + xfarray_idx_t max_nr; + + /* Number of unset slots in the array below @nr. */ + uint64_t unset_slots; + + /* Size of an array element. */ + size_t obj_size; + + /* log2 of array element size, if possible. */ + int obj_size_log; +}; + +int xfarray_create(const char *descr, unsigned long long required_capacity, + size_t obj_size, struct xfarray **arrayp); +void xfarray_destroy(struct xfarray *array); +int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr); +int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); +int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); +int xfarray_store_anywhere(struct xfarray *array, const void *ptr); +bool xfarray_element_is_null(struct xfarray *array, const void *ptr); + +/* Append an element to the array. */ +static inline int xfarray_append(struct xfarray *array, const void *ptr) +{ + return xfarray_store(array, array->nr, ptr); +} + +uint64_t xfarray_length(struct xfarray *array); +int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); + +/* Declarations for xfile array sort functionality. */ + +typedef cmp_func_t xfarray_cmp_fn; + +/* Perform an in-memory heapsort for small subsets. */ +#define XFARRAY_ISORT_SHIFT (4) +#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT) + +/* Evalulate this many points to find the qsort pivot. 
*/ +#define XFARRAY_QSORT_PIVOT_NR (9) + +struct xfarray_sortinfo { + struct xfarray *array; + + /* Comparison function for the sort. */ + xfarray_cmp_fn cmp_fn; + + /* Maximum height of the partition stack. */ + uint8_t max_stack_depth; + + /* Current height of the partition stack. */ + int8_t stack_depth; + + /* Maximum stack depth ever used. */ + uint8_t max_stack_used; + + /* XFARRAY_SORT_* flags; see below. */ + unsigned int flags; + + /* Cache a page here for faster access. */ + struct xfile_page xfpage; + void *page_kaddr; + +#ifdef DEBUG + /* Performance statistics. */ + uint64_t loads; + uint64_t stores; + uint64_t compares; + uint64_t heapsorts; +#endif + /* + * Extra bytes are allocated beyond the end of the structure to store + * quicksort information. C does not permit multiple VLAs per struct, + * so we document all of this in a comment. + * + * Pretend that we have a typedef for array records: + * + * typedef char[array->obj_size] xfarray_rec_t; + * + * First comes the quicksort partition stack: + * + * xfarray_idx_t lo[max_stack_depth]; + * xfarray_idx_t hi[max_stack_depth]; + * + * union { + * + * If for a given subset we decide to use an in-memory sort, we use a + * block of scratchpad records here to compare items: + * + * xfarray_rec_t scratch[ISORT_NR]; + * + * Otherwise, we want to partition the records to partition the array. + * We store the chosen pivot record at the start of the scratchpad area + * and use the rest to sample some records to estimate the median. + * The format of the qsort_pivot array enables us to use the kernel + * heapsort function to place the median value in the middle. + * + * struct { + * xfarray_rec_t pivot; + * struct { + * xfarray_rec_t rec; (rounded up to 8 bytes) + * xfarray_idx_t idx; + * } qsort_pivot[QSORT_PIVOT_NR]; + * }; + * } + */ +}; + +/* Sort can be interrupted by a fatal signal. */ +#define XFARRAY_SORT_KILLABLE (1U << 0) + +int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn, + unsigned int flags); + +#endif /* __XFS_SCRUB_XFARRAY_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c new file mode 100644 index 0000000000..090c3ead43 --- /dev/null +++ b/fs/xfs/scrub/xfile.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" +#include <linux/shmem_fs.h> + +/* + * Swappable Temporary Memory + * ========================== + * + * Online checking sometimes needs to be able to stage a large amount of data + * in memory. This information might not fit in the available memory and it + * doesn't all need to be accessible at all times. In other words, we want an + * indexed data buffer to store data that can be paged out. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xfile must be freed with xfile_destroy. + * + * xfiles assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. 
Reads + * and writes are satisfied directly from the page cache. + * + * NOTE: The current shmemfs implementation has a quirk that in-kernel reads + * of a hole cause a page to be mapped into the file. If you are going to + * create a sparse xfile, please be careful about reading from uninitialized + * parts of the file. These pages are !Uptodate and will eventually be + * reclaimed if not written, but in the short term this boosts memory + * consumption. + */ + +/* + * xfiles must not be exposed to userspace and require upper layers to + * coordinate access to the one handle returned by the constructor, so + * establish a separate lock class for xfiles to avoid confusing lockdep. + */ +static struct lock_class_key xfile_i_mutex_key; + +/* + * Create an xfile of the given size. The description will be used in the + * trace output. + */ +int +xfile_create( + const char *description, + loff_t isize, + struct xfile **xfilep) +{ + struct inode *inode; + struct xfile *xf; + int error = -ENOMEM; + + xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); + if (!xf) + return -ENOMEM; + + xf->file = shmem_file_setup(description, isize, 0); + if (!xf->file) + goto out_xfile; + if (IS_ERR(xf->file)) { + error = PTR_ERR(xf->file); + goto out_xfile; + } + + /* + * We want a large sparse file that we can pread, pwrite, and seek. + * xfile users are responsible for keeping the xfile hidden away from + * all other callers, so we skip timestamp updates and security checks. + * Make the inode only accessible by root, just in case the xfile ever + * escapes. + */ + xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | + FMODE_LSEEK; + xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; + inode = file_inode(xf->file); + inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; + inode->i_mode &= ~0177; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + + lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + + trace_xfile_create(xf); + + *xfilep = xf; + return 0; +out_xfile: + kfree(xf); + return error; +} + +/* Close the file and release all resources. */ +void +xfile_destroy( + struct xfile *xf) +{ + struct inode *inode = file_inode(xf->file); + + trace_xfile_destroy(xf); + + lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); + fput(xf->file); + kfree(xf); +} + +/* + * Read a memory object directly from the xfile's page cache. Unlike regular + * pread, we return -E2BIG and -EFBIG for reads that are too large or at too + * high an offset, instead of truncating the read. Otherwise, we return + * bytes read or an error code, like regular pread. + */ +ssize_t +xfile_pread( + struct xfile *xf, + void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + struct page *page = NULL; + ssize_t read = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pread(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *p, *kaddr; + unsigned int len; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * In-kernel reads of a shmem file cause it to allocate a page + * if the mapping shows a hole. Therefore, if we hit ENOMEM + * we can continue by zeroing the caller's buffer. 
+ */ + page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, + __GFP_NOWARN); + if (IS_ERR(page)) { + error = PTR_ERR(page); + if (error != -ENOMEM) + break; + + memset(buf, 0, len); + goto advance; + } + + if (PageUptodate(page)) { + /* + * xfile pages must never be mapped into userspace, so + * we skip the dcache flush. + */ + kaddr = kmap_local_page(page); + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + kunmap_local(kaddr); + } else { + memset(buf, 0, len); + } + put_page(page); + +advance: + count -= len; + pos += len; + buf += len; + read += len; + } + memalloc_nofs_restore(pflags); + + if (read > 0) + return read; + return error; +} + +/* + * Write a memory object directly to the xfile's page cache. Unlike regular + * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too + * high an offset, instead of truncating the write. Otherwise, we return + * bytes written or an error code, like regular pwrite. + */ +ssize_t +xfile_pwrite( + struct xfile *xf, + const void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + ssize_t written = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pwrite(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *fsdata = NULL; + void *p, *kaddr; + unsigned int len; + int ret; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. + * shmem doesn't support fs freeze, but lockdep doesn't know + * that and will trip over that. + */ + error = aops->write_begin(NULL, mapping, pos, len, &page, + &fsdata); + if (error) + break; + + /* + * xfile pages must never be mapped into userspace, so we skip + * the dcache flush. If the page is not uptodate, zero it + * before writing data. + */ + kaddr = kmap_local_page(page); + if (!PageUptodate(page)) { + memset(kaddr, 0, PAGE_SIZE); + SetPageUptodate(page); + } + p = kaddr + offset_in_page(pos); + memcpy(p, buf, len); + kunmap_local(kaddr); + + ret = aops->write_end(NULL, mapping, pos, len, len, page, + fsdata); + if (ret < 0) { + error = ret; + break; + } + + written += ret; + if (ret != len) + break; + + count -= ret; + pos += ret; + buf += ret; + } + memalloc_nofs_restore(pflags); + + if (written > 0) + return written; + return error; +} + +/* Find the next written area in the xfile data for a given offset. */ +loff_t +xfile_seek_data( + struct xfile *xf, + loff_t pos) +{ + loff_t ret; + + ret = vfs_llseek(xf->file, pos, SEEK_DATA); + trace_xfile_seek_data(xf, pos, ret); + return ret; +} + +/* Query stat information for an xfile. */ +int +xfile_stat( + struct xfile *xf, + struct xfile_stat *statbuf) +{ + struct kstat ks; + int error; + + error = vfs_getattr_nosec(&xf->file->f_path, &ks, + STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); + if (error) + return error; + + statbuf->size = ks.size; + statbuf->bytes = ks.blocks << SECTOR_SHIFT; + return 0; +} + +/* + * Grab the (locked) page for a memory object. The object cannot span a page + * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we + * cannot grab the page, or the usual negative errno. 
+ */ +int +xfile_get_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + void *fsdata = NULL; + loff_t key = round_down(pos, PAGE_SIZE); + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return -ENOMEM; + if (len > PAGE_SIZE - offset_in_page(pos)) + return -ENOTBLK; + + trace_xfile_get_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. shmem + * doesn't support fs freeze, but lockdep doesn't know that and will + * trip over that. + */ + error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, + &fsdata); + if (error) + goto out_pflags; + + /* We got the page, so make sure we push out EOF. */ + if (i_size_read(inode) < pos + len) + i_size_write(inode, pos + len); + + /* + * If the page isn't up to date, fill it with zeroes before we hand it + * to the caller and make sure the backing store will hold on to them. + */ + if (!PageUptodate(page)) { + void *kaddr; + + kaddr = kmap_local_page(page); + memset(kaddr, 0, PAGE_SIZE); + kunmap_local(kaddr); + SetPageUptodate(page); + } + + /* + * Mark each page dirty so that the contents are written to some + * backing store when we drop this buffer, and take an extra reference + * to prevent the xfile page from being swapped or removed from the + * page cache by reclaim if the caller unlocks the page. + */ + set_page_dirty(page); + get_page(page); + + xfpage->page = page; + xfpage->fsdata = fsdata; + xfpage->pos = key; +out_pflags: + memalloc_nofs_restore(pflags); + return error; +} + +/* + * Release the (locked) page for a memory object. Returns 0 or a negative + * errno. + */ +int +xfile_put_page( + struct xfile *xf, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + unsigned int pflags; + int ret; + + trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + + /* Give back the reference that we took in xfile_get_page. */ + put_page(xfpage->page); + + pflags = memalloc_nofs_save(); + ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, + xfpage->page, xfpage->fsdata); + memalloc_nofs_restore(pflags); + memset(xfpage, 0, sizeof(struct xfile_page)); + + if (ret < 0) + return ret; + if (ret != PAGE_SIZE) + return -EIO; + return 0; +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h new file mode 100644 index 0000000000..d56643b0f4 --- /dev/null +++ b/fs/xfs/scrub/xfile.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFILE_H__ +#define __XFS_SCRUB_XFILE_H__ + +struct xfile_page { + struct page *page; + void *fsdata; + loff_t pos; +}; + +static inline bool xfile_page_cached(const struct xfile_page *xfpage) +{ + return xfpage->page != NULL; +} + +static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) +{ + return xfpage->page->index; +} + +struct xfile { + struct file *file; +}; + +int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); +void xfile_destroy(struct xfile *xf); + +ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); +ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, + loff_t pos); + +/* + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pread(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +/* + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pwrite(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +loff_t xfile_seek_data(struct xfile *xf, loff_t pos); + +struct xfile_stat { + loff_t size; + unsigned long long bytes; +}; + +int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); + +int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, + struct xfile_page *xbuf); +int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); + +#endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h new file mode 100644 index 0000000000..a39befa743 --- /dev/null +++ b/fs/xfs/scrub/xfs_scrub.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_H__ +#define __XFS_SCRUB_H__ + +#ifndef CONFIG_XFS_ONLINE_SCRUB +# define xfs_scrub_metadata(file, sm) (-ENOTTY) +#else +int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); +#endif /* CONFIG_XFS_ONLINE_SCRUB */ + +#endif /* __XFS_SCRUB_H__ */ |
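
The declarations in fs/xfs/scrub/xfile.h above make up the whole xfile interface: create an unlinked shmem-backed file, move fixed-size objects in and out with the pread/pwrite-style helpers, and tear it down with xfile_destroy. As a reading aid, here is a minimal caller sketch. It is not part of the patch; the xexample_* names and the record layout are invented for illustration, and the error handling follows the xfile_obj_load/xfile_obj_store convention of folding any short I/O into -ENOMEM.

#include "xfs.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"

/* Invented record type; any fixed-size blob works the same way. */
struct xexample_rec {
	uint64_t		key;
	uint64_t		value;
};

/* Round-trip one record through a freshly created xfile. */
STATIC int
xexample_xfile_roundtrip(void)
{
	struct xexample_rec	rec = { .key = 42, .value = 0xbeef };
	struct xexample_rec	out = { };
	struct xfile_stat	statbuf;
	struct xfile		*xf;
	loff_t			pos = 3 * sizeof(rec);	/* leave a hole up front */
	int			error;

	/* Unlinked shmem file; userspace never sees it. */
	error = xfile_create("xexample records", 0, &xf);
	if (error)
		return error;

	/* Short writes are reported as -ENOMEM by xfile_obj_store. */
	error = xfile_obj_store(xf, &rec, sizeof(rec), pos);
	if (error)
		goto out_destroy;

	error = xfile_obj_load(xf, &out, sizeof(out), pos);
	if (error)
		goto out_destroy;

	/* statbuf.bytes reflects the blocks actually allocated to the file. */
	error = xfile_stat(xf, &statbuf);

out_destroy:
	xfile_destroy(xf);
	return error;
}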
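xfile_get_page and xfile_put_page exist so that callers can operate on xfile contents in place instead of copying through xfile_pread/xfile_pwrite. A sketch of that usage pattern follows, under the same caveats (the function name is invented and this is not part of the patch); note that the object must not cross a page boundary or xfile_get_page returns -ENOTBLK.

/* Bump a counter stored in the xfile without copying it in and out. */
STATIC int
xexample_bump_counter(
	struct xfile		*xf,
	loff_t			pos)
{
	struct xfile_page	xfpage = { };
	uint64_t		*counter;
	void			*kaddr;
	int			error;

	/* The counter must fit inside a single page. */
	error = xfile_get_page(xf, pos, sizeof(*counter), &xfpage);
	if (error)
		return error;

	/*
	 * The page comes back locked, dirty, and with an extra reference, so
	 * it cannot be reclaimed or swapped while we modify it in place.
	 */
	kaddr = kmap_local_page(xfpage.page);
	counter = kaddr + offset_in_page(pos);
	(*counter)++;
	kunmap_local(kaddr);

	return xfile_put_page(xf, &xfpage);
}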
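Because xfiles are sparse, xfile_seek_data lets a caller skip holes rather than scanning every offset. The walker below is another illustrative sketch with invented names; xfile_seek_data has SEEK_DATA semantics, so -ENXIO simply means there is no further data.

/* Call fn for each page-sized region of the xfile that is backed by data. */
STATIC int
xexample_walk_data(
	struct xfile		*xf,
	loff_t			isize,
	int			(*fn)(struct xfile *xf, loff_t pos))
{
	loff_t			pos = 0;
	int			error;

	while ((pos = xfile_seek_data(xf, pos)) >= 0 && pos < isize) {
		error = fn(xf, pos);
		if (error)
			return error;

		/* Jump to the next page; SEEK_DATA skips the holes for us. */
		pos = round_up(pos + 1, PAGE_SIZE);
	}

	/* Running out of data is not an error. */
	if (pos < 0 && pos != -ENXIO)
		return pos;
	return 0;
}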