summaryrefslogtreecommitdiffstats
path: root/fs/xfs/libxfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/xfs/libxfs
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r--fs/xfs/libxfs/xfs_ag.c1042
-rw-r--r--fs/xfs/libxfs/xfs_ag.h242
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c426
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h55
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c3568
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h250
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c641
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h68
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1600
-rw-r--r--fs/xfs/libxfs/xfs_attr.h621
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c3002
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h111
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c712
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h20
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h51
-rw-r--r--fs/xfs/libxfs/xfs_bit.c106
-rw-r--r--fs/xfs/libxfs/xfs_bit.h75
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c6230
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h270
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c703
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h118
-rw-r--r--fs/xfs/libxfs/xfs_btree.c5099
-rw-r--r--fs/xfs/libxfs/xfs_btree.h606
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c880
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h123
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h82
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2698
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h239
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h805
-rw-r--r--fs/xfs/libxfs/xfs_defer.c930
-rw-r--r--fs/xfs/libxfs/xfs_defer.h131
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c769
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h251
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c1275
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c1223
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c1824
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c2337
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h209
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c1293
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c325
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h113
-rw-r--r--fs/xfs/libxfs/xfs_format.h1829
-rw-r--r--fs/xfs/libxfs/xfs_fs.h851
-rw-r--r--fs/xfs/libxfs/xfs_health.h190
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c2969
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h112
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c833
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h83
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c1050
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c773
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h54
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c779
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h268
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h993
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h134
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c201
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h146
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c1915
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h126
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c545
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h73
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2826
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h220
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c696
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h67
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c1098
-rw-r--r--fs/xfs/libxfs/xfs_sb.c1317
-rw-r--r--fs/xfs/libxfs/xfs_sb.h41
-rw-r--r--fs/xfs/libxfs/xfs_shared.h193
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c233
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c225
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c1028
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h105
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h106
-rw-r--r--fs/xfs/libxfs/xfs_types.c230
-rw-r--r--fs/xfs/libxfs/xfs_types.h230
76 files changed, 63662 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
new file mode 100644
index 000000000..bb0c700af
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -0,0 +1,1042 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+
+
+/*
+ * Passive reference counting access wrappers to the perag structures. If the
+ * per-ag structure is to be freed, the freeing code is responsible for cleaning
+ * up objects with passive references before freeing the structure. This is
+ * things like cached buffers.
+ */
+struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ unsigned int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ struct xfs_perag *pag;
+ struct xfs_sb *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ uint64_t fdblocks;
+ int error = 0;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * Read the AGF and AGI buffers to populate the per-ag
+ * structures for us.
+ */
+ pag = xfs_perag_get(mp, index);
+ error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+ if (!error)
+ error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ if (error) {
+ xfs_perag_put(pag);
+ return error;
+ }
+
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ fdblocks = bfree + bfreelst + btree;
+
+ /*
+ * If the new summary counts are obviously incorrect, fail the
+ * mount operation because that implies the AGFs are also corrupt.
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
+ * will prevent xfs_repair from fixing anything.
+ */
+ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
+ xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Overwrite incore superblock counters with just-read data */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_reinit_percpu_counters(mp);
+out:
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ return error;
+}
+
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+ kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+void
+xfs_free_perag(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_buf_hash_destroy(pag);
+
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
+ }
+}
+
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
+{
+ ASSERT(agno < agcount);
+
+ if (agno < agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agblock_t eoag,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks,
+ xfs_agnumber_t *maxagi)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+ xfs_agnumber_t first_initialised = NULLAGNUMBER;
+ int error;
+
+ /*
+ * Walk the current per-ag tree so we don't try to initialise AGs
+ * that already exist (growfs case). Allocate and insert all the
+ * AGs we don't find ready for initialisation.
+ */
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ if (pag) {
+ xfs_perag_put(pag);
+ continue;
+ }
+
+ pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ if (!pag) {
+ error = -ENOMEM;
+ goto out_unwind_new_pags;
+ }
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+
+ error = radix_tree_preload(GFP_NOFS);
+ if (error)
+ goto out_free_pag;
+
+ spin_lock(&mp->m_perag_lock);
+ if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+ WARN_ON_ONCE(1);
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ error = -EEXIST;
+ goto out_free_pag;
+ }
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+
+#ifdef __KERNEL__
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ spin_lock_init(&pag->pagb_lock);
+ spin_lock_init(&pag->pag_state_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pagb_wait);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
+#endif /* __KERNEL__ */
+
+ error = xfs_buf_hash_init(pag);
+ if (error)
+ goto out_remove_pag;
+
+ /* first new pag is fully initialized */
+ if (first_initialised == NULLAGNUMBER)
+ first_initialised = index;
+
+ /*
+ * Pre-calculated geometry
+ */
+ pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ dblocks);
+ pag->min_block = XFS_AGFL_BLOCK(mp);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ }
+
+ index = xfs_set_inode_alloc(mp, agcount);
+
+ if (maxagi)
+ *maxagi = index;
+
+ mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
+ return 0;
+
+out_remove_pag:
+ radix_tree_delete(&mp->m_perag_tree, index);
+out_free_pag:
+ kmem_free(pag);
+out_unwind_new_pags:
+ /* unwind any prior newly initialized pags */
+ for (index = first_initialised; index < agcount; index++) {
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ kmem_free(pag);
+ }
+ return error;
+}
+
+static int
+xfs_get_aghdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ if (error)
+ return error;
+
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Generic btree root block init function
+ */
+static void
+xfs_btroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+}
+
+/* Finish initializing a free space btree. */
+static void
+xfs_freesp_init_recs(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_alloc_rec *arec;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ struct xfs_alloc_rec *nrec;
+ xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
+ mp->m_sb.sb_logstart);
+
+ ASSERT(start >= mp->m_ag_prealloc_blocks);
+ if (start != mp->m_ag_prealloc_blocks) {
+ /*
+ * Modify first record to pad stripe align of log
+ */
+ arec->ar_blockcount = cpu_to_be32(start -
+ mp->m_ag_prealloc_blocks);
+ nrec = arec + 1;
+
+ /*
+ * Insert second record at start of internal log
+ * which then gets trimmed.
+ */
+ nrec->ar_startblock = cpu_to_be32(
+ be32_to_cpu(arec->ar_startblock) +
+ be32_to_cpu(arec->ar_blockcount));
+ arec = nrec;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+ /*
+ * Change record start to after the internal log
+ */
+ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
+ }
+
+ /*
+ * Calculate the record block count and check for the case where
+ * the log might have consumed all available space in the AG. If
+ * so, reset the record count to 0 to avoid exposure of an invalid
+ * record start block.
+ */
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+ if (!arec->ar_blockcount)
+ block->bb_numrecs = 0;
+}
+
+/*
+ * Alloc btree root block init functions
+ */
+static void
+xfs_bnoroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+}
+
+static void
+xfs_cntroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+}
+
+/*
+ * Reverse map root block init
+ */
+static void
+xfs_rmaproot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_rmap_rec *rrec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+
+ /*
+ * mark the AG header regions as static metadata The BNO
+ * btree block is the first block after the headers, so
+ * it's location defines the size of region the static
+ * metadata consumes.
+ *
+ * Note: unlike mkfs, we never have to account for log
+ * space when growing the data regions
+ */
+ rrec = XFS_RMAP_REC_ADDR(block, 1);
+ rrec->rm_startblock = 0;
+ rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+ rrec->rm_offset = 0;
+
+ /* account freespace btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 2);
+ rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(2);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account inode btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 3);
+ rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+ XFS_IBT_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+ rrec->rm_offset = 0;
+
+ /* account for rmap btree root */
+ rrec = XFS_RMAP_REC_ADDR(block, 4);
+ rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account for refc btree root */
+ if (xfs_has_reflink(mp)) {
+ rrec = XFS_RMAP_REC_ADDR(block, 5);
+ rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+
+ /* account for the log space */
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ rrec = XFS_RMAP_REC_ADDR(block,
+ be16_to_cpu(block->bb_numrecs) + 1);
+ rrec->rm_startblock = cpu_to_be32(
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
+ rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+}
+
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_inprogress = 1;
+}
+
+static void
+xfs_agfblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agf *agf = bp->b_addr;
+ xfs_extlen_t tmpsize;
+
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(id->agno);
+ agf->agf_length = cpu_to_be32(id->agsize);
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ if (xfs_has_rmapbt(mp)) {
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
+ }
+
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
+ agf->agf_flcount = 0;
+ tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
+ agf->agf_freeblks = cpu_to_be32(tmpsize);
+ agf->agf_longest = cpu_to_be32(tmpsize);
+ if (xfs_has_crc(mp))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_has_reflink(mp)) {
+ agf->agf_refcount_root = cpu_to_be32(
+ xfs_refc_block(mp));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ }
+
+ if (xfs_ag_contains_log(mp, id->agno)) {
+ int64_t logblocks = mp->m_sb.sb_logblocks;
+
+ be32_add_cpu(&agf->agf_freeblks, -logblocks);
+ agf->agf_longest = cpu_to_be32(id->agsize -
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
+ }
+}
+
+static void
+xfs_agflblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno;
+ int bucket;
+
+ if (xfs_has_crc(mp)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(id->agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+ }
+
+ agfl_bno = xfs_buf_to_agfl_bno(bp);
+ for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+}
+
+static void
+xfs_agiblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agi *agi = bp->b_addr;
+ int bucket;
+
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(id->agno);
+ agi->agi_length = cpu_to_be32(id->agsize);
+ agi->agi_count = 0;
+ agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ agi->agi_level = cpu_to_be32(1);
+ agi->agi_freecount = 0;
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_has_crc(mp))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_has_finobt(mp)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_has_inobtcounts(mp)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_has_finobt(mp))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
+}
+
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+ struct aghdr_init_data *id);
+static int
+xfs_ag_init_hdr(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id,
+ aghdr_init_work_f work,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops);
+ if (error)
+ return error;
+
+ (*work)(mp, bp, id);
+
+ xfs_buf_delwri_queue(bp, &id->buffer_list);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+struct xfs_aghdr_grow_data {
+ xfs_daddr_t daddr;
+ size_t numblks;
+ const struct xfs_buf_ops *ops;
+ aghdr_init_work_f work;
+ xfs_btnum_t type;
+ bool need_init;
+};
+
+/*
+ * Prepare new AG headers to be written to disk. We use uncached buffers here,
+ * as it is assumed these new AG headers are currently beyond the currently
+ * valid filesystem address space. Using cached buffers would trip over EOFS
+ * corruption detection alogrithms in the buffer cache lookup routines.
+ *
+ * This is a non-transactional function, but the prepared buffers are added to a
+ * delayed write buffer list supplied by the caller so they can submit them to
+ * disk and wait on them as required.
+ */
+int
+xfs_ag_init_headers(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id)
+
+{
+ struct xfs_aghdr_grow_data aghdr_data[] = {
+ { /* SB */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_sb_buf_ops,
+ .work = &xfs_sbblock_init,
+ .need_init = true
+ },
+ { /* AGF */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agf_buf_ops,
+ .work = &xfs_agfblock_init,
+ .need_init = true
+ },
+ { /* AGFL */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agfl_buf_ops,
+ .work = &xfs_agflblock_init,
+ .need_init = true
+ },
+ { /* AGI */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agi_buf_ops,
+ .work = &xfs_agiblock_init,
+ .need_init = true
+ },
+ { /* BNO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_bnobt_buf_ops,
+ .work = &xfs_bnoroot_init,
+ .need_init = true
+ },
+ { /* CNT root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_cntbt_buf_ops,
+ .work = &xfs_cntroot_init,
+ .need_init = true
+ },
+ { /* INO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_INO,
+ .need_init = true
+ },
+ { /* FINO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_finobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_FINO,
+ .need_init = xfs_has_finobt(mp)
+ },
+ { /* RMAP root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_rmapbt_buf_ops,
+ .work = &xfs_rmaproot_init,
+ .need_init = xfs_has_rmapbt(mp)
+ },
+ { /* REFC root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_refcountbt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_REFC,
+ .need_init = xfs_has_reflink(mp)
+ },
+ { /* NULL terminating block */
+ .daddr = XFS_BUF_DADDR_NULL,
+ }
+ };
+ struct xfs_aghdr_grow_data *dp;
+ int error = 0;
+
+ /* Account for AG free space in new AG */
+ id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+ for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+ if (!dp->need_init)
+ continue;
+
+ id->daddr = dp->daddr;
+ id->numblks = dp->numblks;
+ id->type = dp->type;
+ error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+int
+xfs_ag_shrink_space(
+ struct xfs_perag *pag,
+ struct xfs_trans **tpp,
+ xfs_extlen_t delta)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_alloc_arg args = {
+ .tp = *tpp,
+ .mp = mp,
+ .type = XFS_ALLOCTYPE_THIS_BNO,
+ .minlen = delta,
+ .maxlen = delta,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .resv = XFS_AG_RESV_NONE,
+ .prod = 1
+ };
+ struct xfs_buf *agibp, *agfbp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ xfs_agblock_t aglen;
+ int error, err2;
+
+ ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+ if (error)
+ return error;
+
+ agi = agibp->b_addr;
+
+ error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
+ if (error)
+ return error;
+
+ agf = agfbp->b_addr;
+ aglen = be32_to_cpu(agi->agi_length);
+ /* some extra paranoid checks before we shrink the ag */
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ return -EFSCORRUPTED;
+ if (delta >= aglen)
+ return -EINVAL;
+
+ args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
+
+ /*
+ * Make sure that the last inode cluster cannot overlap with the new
+ * end of the AG, even if it's sparse.
+ */
+ error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+ aglen - delta);
+ if (error)
+ return error;
+
+ /*
+ * Disable perag reservations so it doesn't cause the allocation request
+ * to fail. We'll reestablish reservation before we return.
+ */
+ error = xfs_ag_resv_free(pag);
+ if (error)
+ return error;
+
+ /* internal log shouldn't also show up in the free space btrees */
+ error = xfs_alloc_vextent(&args);
+ if (!error && args.agbno == NULLAGBLOCK)
+ error = -ENOSPC;
+
+ if (error) {
+ /*
+ * if extent allocation fails, need to roll the transaction to
+ * ensure that the AGFL fixup has been committed anyway.
+ */
+ xfs_trans_bhold(*tpp, agfbp);
+ err2 = xfs_trans_roll(tpp);
+ if (err2)
+ return err2;
+ xfs_trans_bjoin(*tpp, agfbp);
+ goto resv_init_out;
+ }
+
+ /*
+ * if successfully deleted from freespace btrees, need to confirm
+ * per-AG reservation works as expected.
+ */
+ be32_add_cpu(&agi->agi_length, -delta);
+ be32_add_cpu(&agf->agf_length, -delta);
+
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (err2) {
+ be32_add_cpu(&agi->agi_length, delta);
+ be32_add_cpu(&agf->agf_length, delta);
+ if (err2 != -ENOSPC)
+ goto resv_err;
+
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+
+ /*
+ * Roll the transaction before trying to re-init the per-ag
+ * reservation. The new transaction is clean so it will cancel
+ * without any side effects.
+ */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ return error;
+
+ error = -ENOSPC;
+ goto resv_init_out;
+ }
+ xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
+ xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
+ return 0;
+
+resv_init_out:
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (!err2)
+ return error;
+resv_err:
+ xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return err2;
+}
+
+/*
+ * Extent the AG indicated by the @id by the length passed in
+ */
+int
+xfs_ag_extend_space(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ struct xfs_buf *bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ int error;
+
+ ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+ error = xfs_ialloc_read_agi(pag, tp, &bp);
+ if (error)
+ return error;
+
+ agi = bp->b_addr;
+ be32_add_cpu(&agi->agi_length, len);
+ xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
+ /*
+ * Change agf length.
+ */
+ error = xfs_alloc_read_agf(pag, tp, 0, &bp);
+ if (error)
+ return error;
+
+ agf = bp->b_addr;
+ be32_add_cpu(&agf->agf_length, len);
+ ASSERT(agf->agf_length == agi->agi_length);
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
+ /*
+ * Free the new space.
+ *
+ * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
+ * this doesn't actually exist in the rmap btree.
+ */
+ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE);
+ if (error)
+ return error;
+
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
+ be32_to_cpu(agf->agf_length) - len),
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /* Update perag geometry */
+ pag->block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ return 0;
+}
+
+/* Retrieve AG geometry. */
+int
+xfs_ag_get_geometry(
+ struct xfs_perag *pag,
+ struct xfs_ag_geometry *ageo)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ unsigned int freeblks;
+ int error;
+
+ /* Lock the AG headers. */
+ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
+ if (error)
+ return error;
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
+ if (error)
+ goto out_agi;
+
+ /* Fill out form. */
+ memset(ageo, 0, sizeof(*ageo));
+ ageo->ag_number = pag->pag_agno;
+
+ agi = agi_bp->b_addr;
+ ageo->ag_icount = be32_to_cpu(agi->agi_count);
+ ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
+
+ agf = agf_bp->b_addr;
+ ageo->ag_length = be32_to_cpu(agf->agf_length);
+ freeblks = pag->pagf_freeblks +
+ pag->pagf_flcount +
+ pag->pagf_btreeblks -
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
+ ageo->ag_freeblks = freeblks;
+ xfs_ag_geom_health(pag, ageo);
+
+ /* Release resources. */
+ xfs_buf_relse(agf_bp);
+out_agi:
+ xfs_buf_relse(agi_bp);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000..191b22b9a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __LIBXFS_AG_H
+#define __LIBXFS_AG_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_perag;
+
+/*
+ * Per-ag infrastructure
+ */
+
+/* per-AG block reservation data structures*/
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */
+struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ bool pagf_agflreset; /* agfl requires reset before use */
+ uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+
+ int pagb_count; /* pagb slots in use */
+ uint8_t pagf_refcount_level; /* recount btree height */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for the reverse mapping btree. */
+ struct xfs_ag_resv pag_rmapbt_resv;
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+
+ /* Precalculated geometry info */
+ xfs_agblock_t block_count;
+ xfs_agblock_t min_block;
+ xfs_agino_t agino_min;
+ xfs_agino_t agino_max;
+
+#ifdef __KERNEL__
+ /* -- kernel only structures below this line -- */
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing this field.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+ unsigned int pagb_gen; /* generation count for pagb_tree */
+ wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
+
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
+#endif /* __KERNEL__ */
+};
+
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
+ xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
+int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_free_perag(struct xfs_mount *mp);
+
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int tag);
+void xfs_perag_put(struct xfs_perag *pag);
+
+/*
+ * Per-ag geometry infomation and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+ if (agbno >= pag->block_count)
+ return false;
+ if (agbno <= pag->min_block)
+ return false;
+ return true;
+}
+
+static inline bool
+xfs_verify_agbext(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ if (agbno + len <= agbno)
+ return false;
+
+ if (!xfs_verify_agbno(pag, agbno))
+ return false;
+
+ return xfs_verify_agbno(pag, agbno + len - 1);
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino < pag->agino_min)
+ return false;
+ if (agino > pag->agino_max)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino == NULLAGINO)
+ return true;
+ return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
+/*
+ * Perag iteration APIs
+ */
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
+
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
+
+
+#define for_each_perag(mp, agno, pag) \
+ (agno) = 0; \
+ for_each_perag_from((mp), (agno), (pag))
+
+#define for_each_perag_tag(mp, agno, pag, tag) \
+ for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (agno), (tag)))
+
+struct aghdr_init_data {
+ /* per ag data */
+ xfs_agblock_t agno; /* ag to init */
+ xfs_extlen_t agsize; /* new AG size */
+ struct list_head buffer_list; /* buffer writeback list */
+ xfs_rfsblock_t nfree; /* cumulative new free space */
+
+ /* per header data */
+ xfs_daddr_t daddr; /* header location */
+ size_t numblks; /* size of header */
+ xfs_btnum_t type; /* type of btree root block */
+};
+
+int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+ xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+
+#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000..5af123d13
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of three changes to the allocator's
+ * behavior: First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks? For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t avail;
+ xfs_extlen_t orig;
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
+ orig = pag->pag_meta_resv.ar_asked;
+ break;
+ case XFS_AG_RESV_RMAPBT:
+ avail = pag->pagf_freeblks + pag->pagf_flcount -
+ pag->pag_meta_resv.ar_reserved;
+ orig = pag->pag_rmapbt_resv.ar_asked;
+ break;
+ default:
+ ASSERT(0);
+ return false;
+ }
+
+ trace_xfs_ag_resv_critical(pag, type, avail);
+
+ /* Critically low if less than 10% or max btree height remains. */
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
+ pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t len;
+
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ len -= xfs_perag_resv(pag, type)->ar_reserved;
+ break;
+ case XFS_AG_RESV_NONE:
+ /* empty */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_ag_resv_needed(pag, type, len);
+
+ return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t oldresv;
+ int error;
+
+ trace_xfs_ag_resv_free(pag, type, 0);
+
+ resv = xfs_perag_resv(pag, type);
+ if (pag->pag_agno == 0)
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ /*
+ * RMAPBT blocks come from the AGFL and AGFL blocks are always
+ * considered "free", so whatever was reserved at mount time must be
+ * given back at umount.
+ */
+ if (type == XFS_AG_RESV_RMAPBT)
+ oldresv = resv->ar_orig_reserved;
+ else
+ oldresv = resv->ar_reserved;
+ error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ resv->ar_reserved = 0;
+ resv->ar_asked = 0;
+ resv->ar_orig_reserved = 0;
+
+ if (error)
+ trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+ struct xfs_perag *pag)
+{
+ int error;
+ int err2;
+
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+ err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+ if (err2 && !error)
+ error = err2;
+ return error;
+}
+
+static int
+__xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ xfs_extlen_t ask,
+ xfs_extlen_t used)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_resv *resv;
+ int error;
+ xfs_extlen_t hidden_space;
+
+ if (used > ask)
+ ask = used;
+
+ switch (type) {
+ case XFS_AG_RESV_RMAPBT:
+ /*
+ * Space taken by the rmapbt is not subtracted from fdblocks
+ * because the rmapbt lives in the free space. Here we must
+ * subtract the entire reservation from fdblocks so that we
+ * always have blocks available for rmapbt expansion.
+ */
+ hidden_space = ask;
+ break;
+ case XFS_AG_RESV_METADATA:
+ /*
+ * Space taken by all other metadata btrees are accounted
+ * on-disk as used space. We therefore only hide the space
+ * that is reserved but not used by the trees.
+ */
+ hidden_space = ask - used;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ error = -ENOSPC;
+ else
+ error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+ if (error) {
+ trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ xfs_warn(mp,
+"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
+ pag->pag_agno);
+ return error;
+ }
+
+ /*
+ * Reduce the maximum per-AG allocation length by however much we're
+ * trying to reserve for an AG. Since this is a filesystem-wide
+ * counter, we only make the adjustment for AG 0. This assumes that
+ * there aren't any AGs hungrier for per-AG reservation than AG 0.
+ */
+ if (pag->pag_agno == 0)
+ mp->m_ag_max_usable -= ask;
+
+ resv = xfs_perag_resv(pag, type);
+ resv->ar_asked = ask;
+ resv->ar_orig_reserved = hidden_space;
+ resv->ar_reserved = ask - used;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+ return 0;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_extlen_t ask;
+ xfs_extlen_t used;
+ int error = 0, error2;
+ bool has_resv = false;
+
+ /* Create the metadata reservation. */
+ if (pag->pag_meta_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error) {
+ /*
+ * Because we didn't have per-AG reservations when the
+ * finobt feature was added we might not be able to
+ * reserve all needed blocks. Warn and fall back to the
+ * old and potentially buggy code in that case, but
+ * ensure we do have the reservation for the refcountbt.
+ */
+ ask = used = 0;
+
+ mp->m_finobt_nores = true;
+
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
+ &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
+ if (ask)
+ has_resv = true;
+ }
+
+ /* Create the RMAPBT metadata reservation */
+ if (pag->pag_rmapbt_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
+ if (error)
+ goto out;
+ if (ask)
+ has_resv = true;
+ }
+
+out:
+ /*
+ * Initialize the pagf if we have at least one active reservation on the
+ * AG. This may have occurred already via reservation calculation, but
+ * fall back to an explicit init to ensure the in-core allocbt usage
+ * counters are initialized as soon as possible. This is important
+ * because filesystems with large perag reservations are susceptible to
+ * free space reservation problems that the allocbt counter is used to
+ * address.
+ */
+ if (has_resv) {
+ error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
+ if (error2)
+ return error2;
+
+ /*
+ * If there isn't enough space in the AG to satisfy the
+ * reservation, let the caller know that there wasn't enough
+ * space. Callers are responsible for deciding what to do
+ * next, since (in theory) we can stumble along with
+ * insufficient reservation if data blocks are being freed to
+ * replenish the AG's free space.
+ */
+ if (!error &&
+ xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
+ pag->pagf_freeblks + pag->pagf_flcount)
+ error = -ENOSPC;
+ }
+
+ return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t len;
+ uint field;
+
+ trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+ switch (type) {
+ case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ fallthrough;
+ case XFS_AG_RESV_NONE:
+ field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS;
+ xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+ return;
+ }
+
+ len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+ resv->ar_reserved -= len;
+ if (type == XFS_AG_RESV_RMAPBT)
+ return;
+ /* Allocations of reserved blocks only need on-disk sb updates... */
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+ /* ...but non-reserved blocks need in-core and on-disk updates. */
+ if (args->len > len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+ -((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ xfs_extlen_t leftover;
+ struct xfs_ag_resv *resv;
+
+ trace_xfs_ag_resv_free_extent(pag, type, len);
+
+ switch (type) {
+ case XFS_AG_RESV_AGFL:
+ return;
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_RMAPBT:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ fallthrough;
+ case XFS_AG_RESV_NONE:
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ return;
+ }
+
+ leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+ resv->ar_reserved += leftover;
+ if (type == XFS_AG_RESV_RMAPBT)
+ return;
+ /* Freeing into the reserved pool only requires on-disk update... */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+ /* ...but freeing beyond that requires in-core and on-disk update. */
+ if (len > leftover)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000..b74b21000
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+ enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_trans *tp, xfs_extlen_t len);
+
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_RMAPBT:
+ return &pag->pag_rmapbt_resv;
+ default:
+ return NULL;
+ }
+}
+
+/*
+ * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
+ * the AGFL, they are allocated one at a time and the reservation updates don't
+ * require a transaction.
+ */
+static inline void
+xfs_ag_resv_rmapbt_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_alloc_arg args = { NULL };
+ struct xfs_perag *pag;
+
+ args.len = 1;
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
+ xfs_perag_put(pag);
+}
+
+#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000..de79f5d07
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,3568 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_bmap.h"
+
+struct kmem_cache *xfs_extfree_item_cache;
+
+struct workqueue_struct *xfs_alloc_wq;
+
+#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define XFSA_FIXUP_BNO_OK 1
+#define XFSA_FIXUP_CNT_OK 2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+
+/*
+ * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
+ * the beginning of the block for a proper header with the location information
+ * and CRC.
+ */
+unsigned int
+xfs_agfl_size(
+ struct xfs_mount *mp)
+{
+ unsigned int size = mp->m_sb.sb_sectsize;
+
+ if (xfs_has_crc(mp))
+ size -= sizeof(struct xfs_agfl);
+
+ return size / sizeof(xfs_agblock_t);
+}
+
+unsigned int
+xfs_refc_block(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_rmapbt(mp))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_has_finobt(mp))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+xfs_extlen_t
+xfs_prealloc_blocks(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_reflink(mp))
+ return xfs_refc_block(mp) + 1;
+ if (xfs_has_rmapbt(mp))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_has_finobt(mp))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+/*
+ * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * guarantee that we can refill the AGFL prior to allocating space in a nearly
+ * full AG. Although the space described by the free space btrees, the
+ * blocks used by the freesp btrees themselves, and the blocks owned by the
+ * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
+ * free space in the AG drop so low that the free space btrees cannot refill an
+ * empty AGFL up to the minimum level. Rather than grind through empty AGs
+ * until the fs goes down, we subtract this many AG blocks from the incore
+ * fdblocks to ensure user allocation does not overcommit the space the
+ * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
+ * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ */
+#define XFS_ALLOCBT_AGFL_RESERVE 4
+
+/*
+ * Compute the number of blocks that we set aside to guarantee the ability to
+ * refill the AGFL and handle a full bmap btree split.
+ *
+ * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
+ * AGF buffer (PV 947395), we place constraints on the relationship among
+ * actual allocations for data blocks, freelist blocks, and potential file data
+ * bmap btree blocks. However, these restrictions may result in no actual space
+ * allocated for a delayed extent, for example, a data block in a certain AG is
+ * allocated but there is no additional block for the additional bmap btree
+ * block due to a split of the bmap btree of the file. The result of this may
+ * lead to an infinite loop when the file gets flushed to disk and all delayed
+ * extents need to be actually allocated. To get around this, we explicitly set
+ * aside a few blocks which will not be reserved in delayed allocation.
+ *
+ * For each AG, we need to reserve enough blocks to replenish a totally empty
+ * AGFL and 4 more to handle a potential split of the file's bmap btree.
+ */
+unsigned int
+xfs_alloc_set_aside(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
+}
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
+ * the AGI free inode and rmap btree root blocks.
+ * - blocks on the AGFL according to xfs_alloc_set_aside() limits
+ * - the rmapbt root block
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+unsigned int
+xfs_alloc_ag_max_usable(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
+ blocks += XFS_ALLOCBT_AGFL_RESERVE;
+ blocks += 3; /* AGF, AGI btree root blocks */
+ if (xfs_has_finobt(mp))
+ blocks++; /* finobt root block */
+ if (xfs_has_rmapbt(mp))
+ blocks++; /* rmap root block */
+ if (xfs_has_reflink(mp))
+ blocks++; /* refcount root block */
+
+ return mp->m_sb.sb_agblocks - blocks;
+}
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ int error;
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+static inline bool
+xfs_alloc_cur_active(
+ struct xfs_btree_cur *cur)
+{
+ return cur && cur->bc_ag.abt.active;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_alloc_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ union xfs_btree_rec rec;
+
+ rec.alloc.ar_startblock = cpu_to_be32(bno);
+ rec.alloc.ar_blockcount = cpu_to_be32(len);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !(*stat))
+ return error;
+
+ *bno = be32_to_cpu(rec->alloc.ar_startblock);
+ *len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+ if (*len == 0)
+ goto out_bad_rec;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, *bno, *len))
+ goto out_bad_rec;
+
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "%s Freespace BTree record corruption in AG %d detected!",
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "start block 0x%x block count 0x%x", *bno, *len);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC bool
+xfs_alloc_compute_aligned(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
+ xfs_agblock_t foundbno, /* starting block in found extent */
+ xfs_extlen_t foundlen, /* length in found extent */
+ xfs_agblock_t *resbno, /* result block number */
+ xfs_extlen_t *reslen, /* result length */
+ unsigned *busy_gen)
+{
+ xfs_agblock_t bno = foundbno;
+ xfs_extlen_t len = foundlen;
+ xfs_extlen_t diff;
+ bool busy;
+
+ /* Trim busy sections out of found extent */
+ busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
+
+ /*
+ * If we have a largish extent that happens to start before min_agbno,
+ * see if we can shift it into range...
+ */
+ if (bno < args->min_agbno && bno + len > args->min_agbno) {
+ diff = args->min_agbno - bno;
+ if (len > diff) {
+ bno += diff;
+ len -= diff;
+ }
+ }
+
+ if (args->alignment > 1 && len >= args->minlen) {
+ xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
+
+ diff = aligned_bno - bno;
+
+ *resbno = aligned_bno;
+ *reslen = diff >= len ? 0 : len - diff;
+ } else {
+ *resbno = bno;
+ *reslen = len;
+ }
+
+ return busy;
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t /* difference value (absolute) */
+xfs_alloc_compute_diff(
+ xfs_agblock_t wantbno, /* target starting block */
+ xfs_extlen_t wantlen, /* target length */
+ xfs_extlen_t alignment, /* target alignment */
+ int datatype, /* are we allocating data? */
+ xfs_agblock_t freebno, /* freespace's starting block */
+ xfs_extlen_t freelen, /* freespace's length */
+ xfs_agblock_t *newbnop) /* result: best start block from free */
+{
+ xfs_agblock_t freeend; /* end of freespace extent */
+ xfs_agblock_t newbno1; /* return block number */
+ xfs_agblock_t newbno2; /* other new block number */
+ xfs_extlen_t newlen1=0; /* length with newbno1 */
+ xfs_extlen_t newlen2=0; /* length with newbno2 */
+ xfs_agblock_t wantend; /* end of target extent */
+ bool userdata = datatype & XFS_ALLOC_USERDATA;
+
+ ASSERT(freelen >= wantlen);
+ freeend = freebno + freelen;
+ wantend = wantbno + wantlen;
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
+ if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else if (freeend >= wantend && alignment > 1) {
+ newbno1 = roundup(wantbno, alignment);
+ newbno2 = newbno1 - alignment;
+ if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ else
+ newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+ if (newbno2 < freebno)
+ newbno2 = NULLAGBLOCK;
+ else
+ newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+ if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+ if (newlen1 < newlen2 ||
+ (newlen1 == newlen2 &&
+ XFS_ABSDIFF(newbno1, wantbno) >
+ XFS_ABSDIFF(newbno2, wantbno)))
+ newbno1 = newbno2;
+ } else if (newbno2 != NULLAGBLOCK)
+ newbno1 = newbno2;
+ } else if (freeend >= wantend) {
+ newbno1 = wantbno;
+ } else if (alignment > 1) {
+ newbno1 = roundup(freeend - wantlen, alignment);
+ if (newbno1 > freeend - wantlen &&
+ newbno1 - alignment >= freebno)
+ newbno1 -= alignment;
+ else if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else
+ newbno1 = freeend - wantlen;
+ *newbnop = newbno1;
+ return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_extlen_t k;
+ xfs_extlen_t rlen;
+
+ ASSERT(args->mod < args->prod);
+ rlen = args->len;
+ ASSERT(rlen >= args->minlen);
+ ASSERT(rlen <= args->maxlen);
+ if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+ (args->mod == 0 && rlen < args->prod))
+ return;
+ k = rlen % args->prod;
+ if (k == args->mod)
+ return;
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ /* casts to (int) catch length underflows */
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
+ ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+ rlen + args->minleft);
+ args->len = rlen;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks. The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int /* error code */
+xfs_alloc_fixup_trees(
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
+ xfs_agblock_t fbno, /* starting block of free extent */
+ xfs_extlen_t flen, /* length of free extent */
+ xfs_agblock_t rbno, /* starting block of returned extent */
+ xfs_extlen_t rlen, /* length of returned extent */
+ int flags) /* flags, XFSA_FIXUP_... */
+{
+ int error; /* error code */
+ int i; /* operation results */
+ xfs_agblock_t nfbno1; /* first new free startblock */
+ xfs_agblock_t nfbno2; /* second new free startblock */
+ xfs_extlen_t nflen1=0; /* first new free length */
+ xfs_extlen_t nflen2=0; /* second new free length */
+ struct xfs_mount *mp;
+
+ mp = cnt_cur->bc_mp;
+
+ /*
+ * Look up the record in the by-size tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Look up the record in the by-block tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+#ifdef DEBUG
+ if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+ struct xfs_btree_block *bnoblock;
+ struct xfs_btree_block *cntblock;
+
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
+
+ if (XFS_IS_CORRUPT(mp,
+ bnoblock->bb_numrecs !=
+ cntblock->bb_numrecs))
+ return -EFSCORRUPTED;
+ }
+#endif
+
+ /*
+ * Deal with all four cases: the allocated record is contained
+ * within the freespace record, so we can have new freespace
+ * at either (or both) end, or no freespace remaining.
+ */
+ if (rbno == fbno && rlen == flen)
+ nfbno1 = nfbno2 = NULLAGBLOCK;
+ else if (rbno == fbno) {
+ nfbno1 = rbno + rlen;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else if (rbno + rlen == fbno + flen) {
+ nfbno1 = fbno;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else {
+ nfbno1 = fbno;
+ nflen1 = rbno - fbno;
+ nfbno2 = rbno + rlen;
+ nflen2 = (fbno + flen) - nfbno2;
+ }
+ /*
+ * Delete the entry from the by-size btree.
+ */
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ /*
+ * Add new by-size btree entry(s).
+ */
+ if (nfbno1 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Fix up the by-block btree entry(s).
+ */
+ if (nfbno1 == NULLAGBLOCK) {
+ /*
+ * No remaining freespace, just delete the by-block tree entry.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ } else {
+ /*
+ * Update the by-block entry to start later|be shorter.
+ */
+ if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+ return error;
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ /*
+ * 2 resulting free entries, need to add one.
+ */
+ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
+ int i;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_has_crc(mp))
+ return NULL;
+
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
+ return __this_address;
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
+ for (i = 0; i < xfs_agfl_size(mp); i++) {
+ if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ return __this_address;
+ }
+
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
+ return __this_address;
+ return NULL;
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agfl_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc AGFLs */
+ if (!xfs_has_crc(mp))
+ return;
+
+ fa = xfs_agfl_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+ .verify_struct = xfs_agfl_verify,
+};
+
+/*
+ * Read in the allocation group free block array.
+ */
+int
+xfs_alloc_read_agfl(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_read_buf(
+ mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_AGFL_REF);
+ *bpp = bp;
+ return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ long len)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+
+ agbp->b_pag->pagf_freeblks += len;
+ be32_add_cpu(&agf->agf_freeblks, len);
+
+ if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+ be32_to_cpu(agf->agf_length))) {
+ xfs_buf_mark_corrupt(agbp);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+ return 0;
+}
+
+/*
+ * Block allocation algorithm and data structures.
+ */
+struct xfs_alloc_cur {
+ struct xfs_btree_cur *cnt; /* btree cursors */
+ struct xfs_btree_cur *bnolt;
+ struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
+ xfs_agblock_t rec_bno;/* extent startblock */
+ xfs_extlen_t rec_len;/* extent length */
+ xfs_agblock_t bno; /* alloc bno */
+ xfs_extlen_t len; /* alloc len */
+ xfs_extlen_t diff; /* diff from search bno */
+ unsigned int busy_gen;/* busy state */
+ bool busy;
+};
+
+/*
+ * Set up cursors, etc. in the extent allocation cursor. This function can be
+ * called multiple times to reset an initialized structure without having to
+ * reallocate cursors.
+ */
+static int
+xfs_alloc_cur_setup(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+ int i;
+
+ ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
+
+ acur->cur_len = args->maxlen;
+ acur->rec_bno = 0;
+ acur->rec_len = 0;
+ acur->bno = 0;
+ acur->len = 0;
+ acur->diff = -1;
+ acur->busy = false;
+ acur->busy_gen = 0;
+
+ /*
+ * Perform an initial cntbt lookup to check for availability of maxlen
+ * extents. If this fails, we'll return -ENOSPC to signal the caller to
+ * attempt a small allocation.
+ */
+ if (!acur->cnt)
+ acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
+ if (error)
+ return error;
+
+ /*
+ * Allocate the bnobt left and right search cursors.
+ */
+ if (!acur->bnolt)
+ acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_BNO);
+ if (!acur->bnogt)
+ acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag, XFS_BTNUM_BNO);
+ return i == 1 ? 0 : -ENOSPC;
+}
+
+static void
+xfs_alloc_cur_close(
+ struct xfs_alloc_cur *acur,
+ bool error)
+{
+ int cur_error = XFS_BTREE_NOERROR;
+
+ if (error)
+ cur_error = XFS_BTREE_ERROR;
+
+ if (acur->cnt)
+ xfs_btree_del_cursor(acur->cnt, cur_error);
+ if (acur->bnolt)
+ xfs_btree_del_cursor(acur->bnolt, cur_error);
+ if (acur->bnogt)
+ xfs_btree_del_cursor(acur->bnogt, cur_error);
+ acur->cnt = acur->bnolt = acur->bnogt = NULL;
+}
+
+/*
+ * Check an extent for allocation and track the best available candidate in the
+ * allocation structure. The cursor is deactivated if it has entered an out of
+ * range state based on allocation arguments. Optionally return the extent
+ * extent geometry and allocation status if requested by the caller.
+ */
+static int
+xfs_alloc_cur_check(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ int *new)
+{
+ int error, i;
+ xfs_agblock_t bno, bnoa, bnew;
+ xfs_extlen_t len, lena, diff = -1;
+ bool busy;
+ unsigned busy_gen = 0;
+ bool deactivate = false;
+ bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+
+ *new = 0;
+
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * Check minlen and deactivate a cntbt cursor if out of acceptable size
+ * range (i.e., walking backwards looking for a minlen extent).
+ */
+ if (len < args->minlen) {
+ deactivate = !isbnobt;
+ goto out;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
+ &busy_gen);
+ acur->busy |= busy;
+ if (busy)
+ acur->busy_gen = busy_gen;
+ /* deactivate a bnobt cursor outside of locality range */
+ if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
+ deactivate = isbnobt;
+ goto out;
+ }
+ if (lena < args->minlen)
+ goto out;
+
+ args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < acur->len)
+ goto out;
+
+ /*
+ * We have an aligned record that satisfies minlen and beats or matches
+ * the candidate extent size. Compare locality for near allocation mode.
+ */
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto out;
+
+ /*
+ * Deactivate a bnobt cursor with worse locality than the current best.
+ */
+ if (diff > acur->diff) {
+ deactivate = isbnobt;
+ goto out;
+ }
+
+ ASSERT(args->len > acur->len ||
+ (args->len == acur->len && diff <= acur->diff));
+ acur->rec_bno = bno;
+ acur->rec_len = len;
+ acur->bno = bnew;
+ acur->len = args->len;
+ acur->diff = diff;
+ *new = 1;
+
+ /*
+ * We're done if we found a perfect allocation. This only deactivates
+ * the current cursor, but this is just an optimization to terminate a
+ * cntbt search that otherwise runs to the edge of the tree.
+ */
+ if (acur->diff == 0 && acur->len == args->maxlen)
+ deactivate = true;
+out:
+ if (deactivate)
+ cur->bc_ag.abt.active = false;
+ trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
+ *new);
+ return 0;
+}
+
+/*
+ * Complete an allocation of a candidate extent. Remove the extent from both
+ * trees and update the args structure.
+ */
+STATIC int
+xfs_alloc_cur_finish(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ int error;
+
+ ASSERT(acur->cnt && acur->bnolt);
+ ASSERT(acur->bno >= acur->rec_bno);
+ ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
+ ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
+
+ error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
+ acur->rec_len, acur->bno, acur->len, 0);
+ if (error)
+ return error;
+
+ args->agbno = acur->bno;
+ args->len = acur->len;
+ args->wasfromfl = 0;
+
+ trace_xfs_alloc_cur(args);
+ return 0;
+}
+
+/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_cntbt_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_btree_cur *cur = acur->cnt;
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len)
+ error = xfs_alloc_cur_check(args, acur, cur,
+ &i);
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Increment the search key until we find at least one allocation
+ * candidate or if the extent we found was larger. Otherwise, double the
+ * search key to optimize the search. Efficiency is more important here
+ * than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
+
+/*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ struct xfs_agf *agf = args->agbp->b_addr;
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i = 0;
+
+ /*
+ * If a cntbt cursor is provided, try to allocate the largest record in
+ * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+ * allocation. Make sure to respect minleft even when pulling from the
+ * freelist.
+ */
+ if (ccur)
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
+ if (i) {
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ goto out;
+ }
+
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ be32_to_cpu(agf->agf_flcount) <= args->minleft)
+ goto out;
+
+ error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+ &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
+
+ xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
+ (args->datatype & XFS_ALLOC_NOBUSY));
+
+ if (args->datatype & XFS_ALLOC_USERDATA) {
+ struct xfs_buf *bp;
+
+ error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ args->mp->m_bsize, 0, &bp);
+ if (error)
+ goto error;
+ xfs_trans_binval(args->tp, bp);
+ }
+ *fbnop = args->agbno = fbno;
+ *flenp = args->len = 1;
+ if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
+ */
+ error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent(
+ xfs_alloc_arg_t *args) /* argument structure for allocation */
+{
+ int error=0;
+
+ ASSERT(args->minlen > 0);
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->mod < args->prod);
+ ASSERT(args->alignment > 0);
+
+ /*
+ * Branch to correct routine based on the type.
+ */
+ args->wasfromfl = 0;
+ switch (args->type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ error = xfs_alloc_ag_vextent_size(args);
+ break;
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ error = xfs_alloc_ag_vextent_near(args);
+ break;
+ case XFS_ALLOCTYPE_THIS_BNO:
+ error = xfs_alloc_ag_vextent_exact(args);
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+
+ if (error || args->agbno == NULLAGBLOCK)
+ return error;
+
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
+ ASSERT(args->agbno % args->alignment == 0);
+
+ /* if not file data, insert new block into the reverse map btree */
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ return error;
+ }
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
+ -((long)(args->len)));
+ if (error)
+ return error;
+
+ ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
+ args->agbno, args->len));
+ }
+
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
+ return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_exact(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
+ int error;
+ xfs_agblock_t fbno; /* start block of found extent */
+ xfs_extlen_t flen; /* length of found extent */
+ xfs_agblock_t tbno; /* start block of busy extent */
+ xfs_extlen_t tlen; /* length of busy extent */
+ xfs_agblock_t tend; /* end block of busy extent */
+ int i; /* success/failure of operation */
+ unsigned busy_gen;
+
+ ASSERT(args->alignment == 1);
+
+ /*
+ * Allocate/initialize a cursor for the by-number freespace btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_BNO);
+
+ /*
+ * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+ * Look for the closest free block <= bno, it must contain bno
+ * if any free block does.
+ */
+ error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+ if (error)
+ goto error0;
+ if (!i)
+ goto not_found;
+
+ /*
+ * Grab the freespace record.
+ */
+ error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(fbno <= args->agbno);
+
+ /*
+ * Check for overlapping busy extents.
+ */
+ tbno = fbno;
+ tlen = flen;
+ xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
+
+ /*
+ * Give up if the start of the extent is busy, or the freespace isn't
+ * long enough for the minimum request.
+ */
+ if (tbno > args->agbno)
+ goto not_found;
+ if (tlen < args->minlen)
+ goto not_found;
+ tend = tbno + tlen;
+ if (tend < args->agbno + args->minlen)
+ goto not_found;
+
+ /*
+ * End of extent will be smaller of the freespace end and the
+ * maximal requested end.
+ *
+ * Fix the length according to mod and prod if given.
+ */
+ args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+ - args->agbno;
+ xfs_alloc_fix_len(args);
+ ASSERT(args->agbno + args->len <= tend);
+
+ /*
+ * We are allocating agbno for args->len
+ * Allocate/initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_CNT);
+ ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
+ error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+ args->len, XFSA_FIXUP_BNO_OK);
+ if (error) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+ args->wasfromfl = 0;
+ trace_xfs_alloc_exact_done(args);
+ return 0;
+
+not_found:
+ /* Didn't find it, return null. */
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_exact_notfound(args);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ trace_xfs_alloc_exact_error(args);
+ return error;
+}
+
+/*
+ * Search a given number of btree records in a given direction. Check each
+ * record against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_walk_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ bool increment,
+ bool find_one, /* quit on first candidate */
+ int count, /* rec count (-1 for infinite) */
+ int *stat)
+{
+ int error;
+ int i;
+
+ *stat = 0;
+
+ /*
+ * Search so long as the cursor is active or we find a better extent.
+ * The cursor is deactivated if it extends beyond the range of the
+ * current allocation candidate.
+ */
+ while (xfs_alloc_cur_active(cur) && count) {
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ *stat = 1;
+ if (find_one)
+ break;
+ }
+ if (!xfs_alloc_cur_active(cur))
+ break;
+
+ if (increment)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ cur->bc_ag.abt.active = false;
+
+ if (count > 0)
+ count--;
+ }
+
+ return 0;
+}
+
+/*
+ * Search the by-bno and by-size btrees in parallel in search of an extent with
+ * ideal locality based on the NEAR mode ->agbno locality hint.
+ */
+STATIC int
+xfs_alloc_ag_vextent_locality(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ struct xfs_btree_cur *fbcur = NULL;
+ int error;
+ int i;
+ bool fbinc;
+
+ ASSERT(acur->len == 0);
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+
+ *stat = 0;
+
+ error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
+ if (error)
+ return error;
+
+ /*
+ * Search the bnobt and cntbt in parallel. Search the bnobt left and
+ * right and lookup the closest extent to the locality hint for each
+ * extent size key in the cntbt. The entire search terminates
+ * immediately on a bnobt hit because that means we've found best case
+ * locality. Otherwise the search continues until the cntbt cursor runs
+ * off the end of the tree. If no allocation candidate is found at this
+ * point, give up on locality, walk backwards from the end of the cntbt
+ * and take the first available extent.
+ *
+ * The parallel tree searches balance each other out to provide fairly
+ * consistent performance for various situations. The bnobt search can
+ * have pathological behavior in the worst case scenario of larger
+ * allocation requests and fragmented free space. On the other hand, the
+ * bnobt is able to satisfy most smaller allocation requests much more
+ * quickly than the cntbt. The cntbt search can sift through fragmented
+ * free space and sets of free extents for larger allocation requests
+ * more quickly than the bnobt. Since the locality hint is just a hint
+ * and we don't want to scan the entire bnobt for perfect locality, the
+ * cntbt search essentially bounds the bnobt search such that we can
+ * find good enough locality at reasonable performance in most cases.
+ */
+ while (xfs_alloc_cur_active(acur->bnolt) ||
+ xfs_alloc_cur_active(acur->bnogt) ||
+ xfs_alloc_cur_active(acur->cnt)) {
+
+ trace_xfs_alloc_cur_lookup(args);
+
+ /*
+ * Search the bnobt left and right. In the case of a hit, finish
+ * the search in the opposite direction and we're done.
+ */
+ error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_left(args);
+ fbcur = acur->bnogt;
+ fbinc = true;
+ break;
+ }
+ error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
+ 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_right(args);
+ fbcur = acur->bnolt;
+ fbinc = false;
+ break;
+ }
+
+ /*
+ * Check the extent with best locality based on the current
+ * extent size search key and keep track of the best candidate.
+ */
+ error = xfs_alloc_cntbt_iter(args, acur);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ trace_xfs_alloc_cur_lookup_done(args);
+ break;
+ }
+ }
+
+ /*
+ * If we failed to find anything due to busy extents, return empty
+ * handed so the caller can flush and retry. If no busy extents were
+ * found, walk backwards from the end of the cntbt as a last resort.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i) {
+ acur->cnt->bc_ag.abt.active = true;
+ fbcur = acur->cnt;
+ fbinc = false;
+ }
+ }
+
+ /*
+ * Search in the opposite direction for a better entry in the case of
+ * a bnobt hit or walk backwards from the end of the cntbt.
+ */
+ if (fbcur) {
+ error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
+ &i);
+ if (error)
+ return error;
+ }
+
+ if (acur->len)
+ *stat = 1;
+
+ return 0;
+}
+
+/* Check the last block of the cnt btree for allocations. */
+static int
+xfs_alloc_ag_vextent_lastblock(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ xfs_agblock_t *bno,
+ xfs_extlen_t *len,
+ bool *allocated)
+{
+ int error;
+ int i;
+
+#ifdef DEBUG
+ /* Randomly don't execute the first algorithm. */
+ if (prandom_u32_max(2))
+ return 0;
+#endif
+
+ /*
+ * Start from the entry that lookup found, sequence through all larger
+ * free blocks. If we're actually pointing at a record smaller than
+ * maxlen, go to the start of this block, and skip all those smaller
+ * than minlen.
+ */
+ if (*len || args->alignment > 1) {
+ acur->cnt->bc_levels[0].ptr = 1;
+ do {
+ error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+ if (*len >= args->minlen)
+ break;
+ error = xfs_btree_increment(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ } while (i);
+ ASSERT(*len >= args->minlen);
+ if (!i)
+ return 0;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
+ if (error)
+ return error;
+
+ /*
+ * It didn't work. We COULD be in a case where there's a good record
+ * somewhere, so try again.
+ */
+ if (acur->len == 0)
+ return 0;
+
+ trace_xfs_alloc_near_first(args);
+ *allocated = true;
+ return 0;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int
+xfs_alloc_ag_vextent_near(
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_alloc_cur acur = {};
+ int error; /* error code */
+ int i; /* result code, temporary */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+
+ /* handle uninitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
+restart:
+ len = 0;
+
+ /*
+ * Set up cursors and see if there are any free extents as big as
+ * maxlen. If not, pick the last entry in the tree unless the tree is
+ * empty.
+ */
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error == -ENOSPC) {
+ error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
+ &len, &i);
+ if (error)
+ goto out;
+ if (i == 0 || len == 0) {
+ trace_xfs_alloc_near_noentry(args);
+ goto out;
+ }
+ ASSERT(i == 1);
+ } else if (error) {
+ goto out;
+ }
+
+ /*
+ * First algorithm.
+ * If the requested extent is large wrt the freespaces available
+ * in this a.g., then the cursor will be pointing to a btree entry
+ * near the right edge of the tree. If it's in the last btree leaf
+ * block, then we just examine all the entries in that block
+ * that are big enough, and pick the best one.
+ */
+ if (xfs_btree_islastblock(acur.cnt, 0)) {
+ bool allocated = false;
+
+ error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
+ &allocated);
+ if (error)
+ goto out;
+ if (allocated)
+ goto alloc_finish;
+ }
+
+ /*
+ * Second algorithm. Combined cntbt and bnobt search to find ideal
+ * locality.
+ */
+ error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
+ if (error)
+ goto out;
+
+ /*
+ * If we couldn't get anything, give up.
+ */
+ if (!acur.len) {
+ if (acur.busy) {
+ trace_xfs_alloc_near_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
+ goto restart;
+ }
+ trace_xfs_alloc_size_neither(args);
+ args->agbno = NULLAGBLOCK;
+ goto out;
+ }
+
+alloc_finish:
+ /* fix up btrees on a successful allocation */
+ error = xfs_alloc_cur_finish(args, &acur);
+
+out:
+ xfs_alloc_cur_close(&acur, error);
+ return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_size(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ struct xfs_agf *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
+ int error; /* error result */
+ xfs_agblock_t fbno; /* start of found freespace */
+ xfs_extlen_t flen; /* length of found freespace */
+ int i; /* temp status variable */
+ xfs_agblock_t rbno; /* returned block number */
+ xfs_extlen_t rlen; /* length of returned extent */
+ bool busy;
+ unsigned busy_gen;
+
+restart:
+ /*
+ * Allocate and initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_CNT);
+ bno_cur = NULL;
+
+ /*
+ * Look for an entry >= maxlen+alignment-1 blocks.
+ */
+ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+ args->maxlen + args->alignment - 1, &i)))
+ goto error0;
+
+ /*
+ * If none then we have to settle for a smaller extent. In the case that
+ * there are no large extents, this will return the last entry in the
+ * tree unless the tree is empty. In the case that there are only busy
+ * large extents, this will return the largest small extent unless there
+ * are no smaller extents available.
+ */
+ if (!i) {
+ error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+ &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (i == 0 || flen == 0) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_noentry(args);
+ return 0;
+ }
+ ASSERT(i == 1);
+ busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+ &rlen, &busy_gen);
+ } else {
+ /*
+ * Search for a non-busy extent that is large enough.
+ */
+ for (;;) {
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen, &busy_gen);
+
+ if (rlen >= args->maxlen)
+ break;
+
+ error = xfs_btree_increment(cnt_cur, 0, &i);
+ if (error)
+ goto error0;
+ if (i == 0) {
+ /*
+ * Our only valid extents must have been busy.
+ * Make it unbusy by forcing the log out and
+ * retrying.
+ */
+ xfs_btree_del_cursor(cnt_cur,
+ XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_extent_busy_flush(args->mp,
+ args->pag, busy_gen);
+ goto restart;
+ }
+ }
+ }
+
+ /*
+ * In the first case above, we got the last entry in the
+ * by-size btree. Now we check to see if the space hits maxlen
+ * once aligned; if not, we search left for something better.
+ * This can't happen in the second case above.
+ */
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rlen < args->maxlen) {
+ xfs_agblock_t bestfbno;
+ xfs_extlen_t bestflen;
+ xfs_agblock_t bestrbno;
+ xfs_extlen_t bestrlen;
+
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ for (;;) {
+ if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+ goto error0;
+ if (i == 0)
+ break;
+ if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (flen < bestrlen)
+ break;
+ busy = xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen, &busy_gen);
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rlen > bestrlen) {
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ if (rlen == args->maxlen)
+ break;
+ }
+ }
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ rlen = bestrlen;
+ rbno = bestrbno;
+ flen = bestflen;
+ fbno = bestfbno;
+ }
+ args->wasfromfl = 0;
+ /*
+ * Fix up the length.
+ */
+ args->len = rlen;
+ if (rlen < args->minlen) {
+ if (busy) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+ goto restart;
+ }
+ goto out_nominleft;
+ }
+ xfs_alloc_fix_len(args);
+
+ rlen = args->len;
+ if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Allocate and initialize a cursor for the by-block tree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag, XFS_BTNUM_BNO);
+ if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+ rbno, rlen, XFSA_FIXUP_CNT_OK)))
+ goto error0;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ cnt_cur = bno_cur = NULL;
+ args->len = rlen;
+ args->agbno = rbno;
+ if (XFS_IS_CORRUPT(args->mp,
+ args->agbno + args->len >
+ be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ trace_xfs_alloc_size_done(args);
+ return 0;
+
+error0:
+ trace_xfs_alloc_size_error(args);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ return error;
+
+out_nominleft:
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_nominleft(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int
+xfs_free_ag_extent(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t gtbno; /* start of right neighbor */
+ xfs_extlen_t gtlen; /* length of right neighbor */
+ xfs_agblock_t ltbno; /* start of left neighbor */
+ xfs_extlen_t ltlen; /* length of left neighbor */
+ xfs_agblock_t nbno; /* new starting block of freesp */
+ xfs_extlen_t nlen; /* new length of freespace */
+ int haveleft; /* have a left neighbor */
+ int haveright; /* have a right neighbor */
+ int i;
+ int error;
+ struct xfs_perag *pag = agbp->b_pag;
+
+ bno_cur = cnt_cur = NULL;
+ mp = tp->t_mountp;
+
+ if (!xfs_rmap_should_skip_owner_update(oinfo)) {
+ error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Allocate and initialize a cursor for the by-block btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ /*
+ * Look for a neighboring block on the left (lower block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+ goto error0;
+ if (haveleft) {
+ /*
+ * There is a block to our left.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * It's not contiguous, though.
+ */
+ if (ltbno + ltlen < bno)
+ haveleft = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+ /*
+ * Look for a neighboring block on the right (higher block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+ goto error0;
+ if (haveright) {
+ /*
+ * There is a block to our right.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * It's not contiguous, though.
+ */
+ if (bno + len < gtbno)
+ haveright = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+ /*
+ * Now allocate and initialize a cursor for the by-size tree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ /*
+ * Have both left and right contiguous neighbors.
+ * Merge all three into a single free block.
+ */
+ if (haveleft && haveright) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Delete the old by-block entry for the right block.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Move the by-block cursor back to the left neighbor.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+#ifdef DEBUG
+ /*
+ * Check that this is the right record: delete didn't
+ * mangle the cursor.
+ */
+ {
+ xfs_agblock_t xxbno;
+ xfs_extlen_t xxlen;
+
+ if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+ &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ xxbno != ltbno ||
+ xxlen != ltlen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+#endif
+ /*
+ * Update remaining by-block entry to the new, joined block.
+ */
+ nbno = ltbno;
+ nlen = len + ltlen + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a left contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveleft) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Back up the by-block cursor to the left neighbor, and
+ * update its length.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ nbno = ltbno;
+ nlen = len + ltlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a right contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveright) {
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Update the starting block and length of the right
+ * neighbor in the by-block tree.
+ */
+ nbno = bno;
+ nlen = len + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * No contiguous neighbors.
+ * Insert the new freespace into the by-block tree.
+ */
+ else {
+ nbno = bno;
+ nlen = len;
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ bno_cur = NULL;
+ /*
+ * In all cases we need to insert the new freespace in the by-size tree.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ cnt_cur = NULL;
+
+ /*
+ * Update the freespace totals in the ag and superblock.
+ */
+ error = xfs_alloc_update_counters(tp, agbp, len);
+ xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
+ if (error)
+ goto error0;
+
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
+
+ trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
+
+ return 0;
+
+ error0:
+ trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_alloc_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ (mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
+}
+
+/*
+ * Find the length of the longest extent in an AG. The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+ struct xfs_perag *pag,
+ xfs_extlen_t need,
+ xfs_extlen_t reserved)
+{
+ xfs_extlen_t delta = 0;
+
+ /*
+ * If the AGFL needs a recharge, we'll have to subtract that from the
+ * longest extent.
+ */
+ if (need > pag->pagf_flcount)
+ delta = need - pag->pagf_flcount;
+
+ /*
+ * If we cannot maintain others' reservations with space from the
+ * not-longest freesp extents, we'll have to subtract /that/ from
+ * the longest extent too.
+ */
+ if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+ delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+ /*
+ * If the longest extent is long enough to satisfy all the
+ * reservations and AGFL rules in place, we can return this extent.
+ */
+ if (pag->pagf_longest > delta)
+ return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ pag->pagf_longest - delta);
+
+ /* Otherwise, let the caller try for 1 block if there's space. */
+ return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
+ * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
+ * return the largest possible minimum length.
+ */
+unsigned int
+xfs_alloc_min_freelist(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ /* AG btrees have at least 1 level. */
+ static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
+ const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ unsigned int min_free;
+
+ ASSERT(mp->m_alloc_maxlevels > 0);
+
+ /* space needed by-bno freespace btree */
+ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
+ mp->m_alloc_maxlevels);
+ /* space needed by-size freespace btree */
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
+ mp->m_alloc_maxlevels);
+ /* space needed reverse mapping used space btree */
+ if (xfs_has_rmapbt(mp))
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
+
+ return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t min_free,
+ int flags)
+{
+ struct xfs_perag *pag = args->pag;
+ xfs_extlen_t alloc_len, longest;
+ xfs_extlen_t reservation; /* blocks that are still reserved */
+ int available;
+ xfs_extlen_t agflcount;
+
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ return true;
+
+ reservation = xfs_ag_resv_needed(pag, args->resv);
+
+ /* do we have enough contiguous free space for the allocation? */
+ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
+ longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
+ if (longest < alloc_len)
+ return false;
+
+ /*
+ * Do we have enough free space remaining for the allocation? Don't
+ * account extra agfl blocks because we are about to defer free them,
+ * making them unavailable until the current transaction commits.
+ */
+ agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
+ available = (int)(pag->pagf_freeblks + agflcount -
+ reservation - min_free - args->minleft);
+ if (available < (int)max(args->total, alloc_len))
+ return false;
+
+ /*
+ * Clamp maxlen to the amount of free space available for the actual
+ * extent allocation.
+ */
+ if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+ args->maxlen = available;
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->maxlen >= args->minlen);
+ }
+
+ return true;
+}
+
+int
+xfs_free_agfl_block(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ struct xfs_buf *agbp,
+ struct xfs_owner_info *oinfo)
+{
+ int error;
+ struct xfs_buf *bp;
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
+ XFS_AG_RESV_AGFL);
+ if (error)
+ return error;
+
+ error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
+ tp->t_mountp->m_bsize, 0, &bp);
+ if (error)
+ return error;
+ xfs_trans_binval(tp, bp);
+
+ return 0;
+}
+
+/*
+ * Check the agfl fields of the agf for inconsistency or corruption. The purpose
+ * is to detect an agfl header padding mismatch between current and early v5
+ * kernels. This problem manifests as a 1-slot size difference between the
+ * on-disk flcount and the active [first, last] range of a wrapped agfl. This
+ * may also catch variants of agfl count corruption unrelated to padding. Either
+ * way, we'll reset the agfl and warn the user.
+ *
+ * Return true if a reset is required before the agfl can be used, false
+ * otherwise.
+ */
+static bool
+xfs_agfl_needs_reset(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf)
+{
+ uint32_t f = be32_to_cpu(agf->agf_flfirst);
+ uint32_t l = be32_to_cpu(agf->agf_fllast);
+ uint32_t c = be32_to_cpu(agf->agf_flcount);
+ int agfl_size = xfs_agfl_size(mp);
+ int active;
+
+ /* no agfl header on v4 supers */
+ if (!xfs_has_crc(mp))
+ return false;
+
+ /*
+ * The agf read verifier catches severe corruption of these fields.
+ * Repeat some sanity checks to cover a packed -> unpacked mismatch if
+ * the verifier allows it.
+ */
+ if (f >= agfl_size || l >= agfl_size)
+ return true;
+ if (c > agfl_size)
+ return true;
+
+ /*
+ * Check consistency between the on-disk count and the active range. An
+ * agfl padding mismatch manifests as an inconsistent flcount.
+ */
+ if (c && l >= f)
+ active = l - f + 1;
+ else if (c)
+ active = agfl_size - f + l + 1;
+ else
+ active = 0;
+
+ return active != c;
+}
+
+/*
+ * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
+ * agfl content cannot be trusted. Warn the user that a repair is required to
+ * recover leaked blocks.
+ *
+ * The purpose of this mechanism is to handle filesystems affected by the agfl
+ * header padding mismatch problem. A reset keeps the filesystem online with a
+ * relatively minor free space accounting inconsistency rather than suffer the
+ * inevitable crash from use of an invalid agfl block.
+ */
+static void
+xfs_agfl_reset(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
+
+ ASSERT(pag->pagf_agflreset);
+ trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
+
+ xfs_warn(mp,
+ "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
+ "Please unmount and run xfs_repair.",
+ pag->pag_agno, pag->pagf_flcount);
+
+ agf->agf_flfirst = 0;
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
+ agf->agf_flcount = 0;
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
+ XFS_AGF_FLCOUNT);
+
+ pag->pagf_flcount = 0;
+ pag->pagf_agflreset = false;
+}
+
+/*
+ * Defer an AGFL block free. This is effectively equivalent to
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
+ *
+ * Deferring AGFL frees helps prevent log reservation overruns due to too many
+ * allocation operations in a transaction. AGFL frees are prone to this problem
+ * because for one they are always freed one at a time. Further, an immediate
+ * AGFL block free can cause a btree join and require another block free before
+ * the real allocation can proceed. Deferring the free disconnects freeing up
+ * the AGFL slot from freeing the block.
+ */
+STATIC void
+xfs_defer_agfl_block(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t agbno,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent_free_item *new; /* new element */
+
+ ASSERT(xfs_extfree_item_cache != NULL);
+ ASSERT(oinfo != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+ new->xefi_blockcount = 1;
+ new->xefi_owner = oinfo->oi_owner;
+
+ trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+}
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
+#ifdef DEBUG
+/*
+ * Check if an AGF has a free extent record whose length is equal to
+ * args->minlen.
+ */
+STATIC int
+xfs_exact_minlen_extent_available(
+ struct xfs_alloc_arg *args,
+ struct xfs_buf *agbp,
+ int *stat)
+{
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 0) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 1 && flen != args->minlen)
+ *stat = 0;
+
+out:
+ xfs_btree_del_cursor(cnt_cur, error);
+
+ return error;
+}
+#endif
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ */
+int /* error */
+xfs_alloc_fix_freelist(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
+{
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag = args->pag;
+ struct xfs_trans *tp = args->tp;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_buf *agflbp = NULL;
+ struct xfs_alloc_arg targs; /* local allocation arguments */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ int error = 0;
+
+ /* deferred ops (AGFL block frees) require permanent transactions */
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
+ goto out_no_agbp;
+ }
+ }
+
+ /*
+ * If this is a metadata preferred pag and we are user data then try
+ * somewhere else if we are not being asked to try harder at this
+ * point
+ */
+ if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
+ (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ goto out_agbp_relse;
+ }
+
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags |
+ XFS_ALLOC_FLAG_CHECK))
+ goto out_agbp_relse;
+
+ /*
+ * Get the a.g. freespace buffer.
+ * Can fail if we're not blocking on locks, and it's held.
+ */
+ if (!agbp) {
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
+ goto out_no_agbp;
+ }
+ }
+
+ /* reset a padding mismatched agfl before final free space check */
+ if (pag->pagf_agflreset)
+ xfs_agfl_reset(tp, agbp, pag);
+
+ /* If there isn't enough total space or single-extent, reject it. */
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
+
+#ifdef DEBUG
+ if (args->alloc_minlen_only) {
+ int stat;
+
+ error = xfs_exact_minlen_extent_available(args, agbp, &stat);
+ if (error || !stat)
+ goto out_agbp_relse;
+ }
+#endif
+ /*
+ * Make the freelist shorter if it's too long.
+ *
+ * Note that from this point onwards, we will always release the agf and
+ * agfl buffers on error. This handles the case where we error out and
+ * the buffers are clean or may not have been joined to the transaction
+ * and hence need to be released manually. If they have been joined to
+ * the transaction, then xfs_trans_brelse() will handle them
+ * appropriately based on the recursion count and dirty state of the
+ * buffer.
+ *
+ * XXX (dgc): When we have lots of free space, does this buy us
+ * anything other than extra overhead when we need to put more blocks
+ * back on the free list? Maybe we should only do this when space is
+ * getting low or the AGFL is more than half full?
+ *
+ * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
+ * big; the NORMAP flag prevents AGFL expand/shrink operations from
+ * updating the rmapbt. Both flags are used in xfs_repair while we're
+ * rebuilding the rmapbt, and neither are used by the kernel. They're
+ * both required to ensure that rmaps are correctly recorded for the
+ * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
+ * repair/rmap.c in xfsprogs for details.
+ */
+ memset(&targs, 0, sizeof(targs));
+ /* struct copy below */
+ if (flags & XFS_ALLOC_FLAG_NORMAP)
+ targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ else
+ targs.oinfo = XFS_RMAP_OINFO_AG;
+ while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
+ error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
+ if (error)
+ goto out_agbp_relse;
+
+ /* defer agfl frees */
+ xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ }
+
+ targs.tp = tp;
+ targs.mp = mp;
+ targs.agbp = agbp;
+ targs.agno = args->agno;
+ targs.alignment = targs.minlen = targs.prod = 1;
+ targs.type = XFS_ALLOCTYPE_THIS_AG;
+ targs.pag = pag;
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ goto out_agbp_relse;
+
+ /* Make the freelist longer if it's too short. */
+ while (pag->pagf_flcount < need) {
+ targs.agbno = 0;
+ targs.maxlen = need - pag->pagf_flcount;
+ targs.resv = XFS_AG_RESV_AGFL;
+
+ /* Allocate as many blocks as possible at once. */
+ error = xfs_alloc_ag_vextent(&targs);
+ if (error)
+ goto out_agflbp_relse;
+
+ /*
+ * Stop if we run out. Won't happen if callers are obeying
+ * the restrictions correctly. Can happen for free calls
+ * on a completely full ag.
+ */
+ if (targs.agbno == NULLAGBLOCK) {
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ break;
+ goto out_agflbp_relse;
+ }
+ /*
+ * Put each allocated block on the list.
+ */
+ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+ error = xfs_alloc_put_freelist(pag, tp, agbp,
+ agflbp, bno, 0);
+ if (error)
+ goto out_agflbp_relse;
+ }
+ }
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = agbp;
+ return 0;
+
+out_agflbp_relse:
+ xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+ args->agbp = NULL;
+ return error;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int
+xfs_alloc_get_freelist(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agblock_t *bnop,
+ int btreeblk)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agflbp;
+ xfs_agblock_t bno;
+ __be32 *agfl_bno;
+ int error;
+ uint32_t logflags;
+ struct xfs_mount *mp = tp->t_mountp;
+
+ /*
+ * Freelist is empty, give up.
+ */
+ if (!agf->agf_flcount) {
+ *bnop = NULLAGBLOCK;
+ return 0;
+ }
+ /*
+ * Read the array of free blocks.
+ */
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+
+
+ /*
+ * Get the block number and update the data structures.
+ */
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ be32_add_cpu(&agf->agf_flfirst, 1);
+ xfs_trans_brelse(tp, agflbp);
+ if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
+ agf->agf_flfirst = 0;
+
+ ASSERT(!pag->pagf_agflreset);
+ be32_add_cpu(&agf->agf_flcount, -1);
+ pag->pagf_flcount--;
+
+ logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, 1);
+ pag->pagf_btreeblks++;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+ *bnop = bno;
+
+ return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
+{
+ int first; /* first byte offset */
+ int last; /* last byte offset */
+ static const short offsets[] = {
+ offsetof(xfs_agf_t, agf_magicnum),
+ offsetof(xfs_agf_t, agf_versionnum),
+ offsetof(xfs_agf_t, agf_seqno),
+ offsetof(xfs_agf_t, agf_length),
+ offsetof(xfs_agf_t, agf_roots[0]),
+ offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_flfirst),
+ offsetof(xfs_agf_t, agf_fllast),
+ offsetof(xfs_agf_t, agf_flcount),
+ offsetof(xfs_agf_t, agf_freeblks),
+ offsetof(xfs_agf_t, agf_longest),
+ offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
+ offsetof(xfs_agf_t, agf_rmap_blocks),
+ offsetof(xfs_agf_t, agf_refcount_blocks),
+ offsetof(xfs_agf_t, agf_refcount_root),
+ offsetof(xfs_agf_t, agf_refcount_level),
+ /* needed so that we don't log the whole rest of the structure: */
+ offsetof(xfs_agf_t, agf_spare64),
+ sizeof(xfs_agf_t)
+ };
+
+ trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
+ xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+ xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int
+xfs_alloc_put_freelist(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_buf *agflbp,
+ xfs_agblock_t bno,
+ int btreeblk)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
+ __be32 *blockp;
+ int error;
+ uint32_t logflags;
+ __be32 *agfl_bno;
+ int startoff;
+
+ if (!agflbp) {
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+ }
+
+ be32_add_cpu(&agf->agf_fllast, 1);
+ if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
+ agf->agf_fllast = 0;
+
+ ASSERT(!pag->pagf_agflreset);
+ be32_add_cpu(&agf->agf_flcount, 1);
+ pag->pagf_flcount++;
+
+ logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, -1);
+ pag->pagf_btreeblks--;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
+
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+ *blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_agf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agf *agf = bp->b_addr;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
+ return __this_address;
+ }
+
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
+ return __this_address;
+
+ if (xfs_has_rmapbt(mp) &&
+ (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
+ return __this_address;
+
+ if (xfs_has_rmapbt(mp) &&
+ be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
+ if (xfs_has_lazysbcount(mp) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_has_reflink(mp) &&
+ be32_to_cpu(agf->agf_refcount_blocks) >
+ be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_has_reflink(mp) &&
+ (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+ be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
+ return __this_address;
+
+ return NULL;
+
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agf_verify(bp);
+ if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agf *agf = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_agf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+ .verify_struct = xfs_agf_verify,
+};
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int
+xfs_read_agf(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (error)
+ return error;
+
+ xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
+ */
+int
+xfs_alloc_read_agf(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
+{
+ struct xfs_buf *agfbp;
+ struct xfs_agf *agf;
+ int error;
+ int allocbt_blks;
+
+ trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
+
+ /* We don't support trylock when freeing. */
+ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
+ (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
+ error = xfs_read_agf(pag, tp,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ &agfbp);
+ if (error)
+ return error;
+
+ agf = agfbp->b_addr;
+ if (!pag->pagf_init) {
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
+ pag->pagf_init = 1;
+ pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
+
+ /*
+ * Update the in-core allocbt counter. Filter out the rmapbt
+ * subset of the btreeblks counter because the rmapbt is managed
+ * by perag reservation. Subtract one for the rmapbt root block
+ * because the rmap counter includes it while the btreeblks
+ * counter only tracks non-root blocks.
+ */
+ allocbt_blks = pag->pagf_btreeblks;
+ if (xfs_has_rmapbt(pag->pag_mount))
+ allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+ if (allocbt_blks > 0)
+ atomic64_add(allocbt_blks,
+ &pag->pag_mount->m_allocbt_blks);
+ }
+#ifdef DEBUG
+ else if (!xfs_is_shutdown(pag->pag_mount)) {
+ ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+ ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+ ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+ ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ }
+#endif
+ if (agfbpp)
+ *agfbpp = agfbp;
+ else
+ xfs_trans_brelse(tp, agfbp);
+ return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int /* error */
+xfs_alloc_vextent(
+ struct xfs_alloc_arg *args) /* allocation argument structure */
+{
+ xfs_agblock_t agsize; /* allocation group size */
+ int error;
+ int flags; /* XFS_ALLOC_FLAG_... locking flags */
+ struct xfs_mount *mp; /* mount structure pointer */
+ xfs_agnumber_t sagno; /* starting allocation group number */
+ xfs_alloctype_t type; /* input allocation type */
+ int bump_rotor = 0;
+ xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+ mp = args->mp;
+ type = args->otype = args->type;
+ args->agbno = NULLAGBLOCK;
+ /*
+ * Just fix this up, for the case where the last a.g. is shorter
+ * (or there's only one a.g.) and the caller couldn't easily figure
+ * that out (xfs_bmap_alloc).
+ */
+ agsize = mp->m_sb.sb_agblocks;
+ if (args->maxlen > agsize)
+ args->maxlen = agsize;
+ if (args->alignment == 0)
+ args->alignment = 1;
+ ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->minlen <= agsize);
+ ASSERT(args->mod < args->prod);
+ if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+ XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+ args->minlen > args->maxlen || args->minlen > agsize ||
+ args->mod >= args->prod) {
+ args->fsbno = NULLFSBLOCK;
+ trace_xfs_alloc_vextent_badargs(args);
+ return 0;
+ }
+
+ switch (type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ case XFS_ALLOCTYPE_THIS_BNO:
+ /*
+ * These three force us into a single a.g.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->pag = xfs_perag_get(mp, args->agno);
+ error = xfs_alloc_fix_freelist(args, 0);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ if (!args->agbp) {
+ trace_xfs_alloc_vextent_noagbp(args);
+ break;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ case XFS_ALLOCTYPE_START_BNO:
+ /*
+ * Try near allocation first, then anywhere-in-ag after
+ * the first a.g. fails.
+ */
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+ xfs_is_inode32(mp)) {
+ args->fsbno = XFS_AGB_TO_FSB(mp,
+ ((mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount), 0);
+ bump_rotor = 1;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ fallthrough;
+ case XFS_ALLOCTYPE_FIRST_AG:
+ /*
+ * Rotate through the allocation groups looking for a winner.
+ */
+ if (type == XFS_ALLOCTYPE_FIRST_AG) {
+ /*
+ * Start with allocation group given by bno.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ sagno = 0;
+ flags = 0;
+ } else {
+ /*
+ * Start with the given allocation group.
+ */
+ args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ }
+ /*
+ * Loop over allocation groups twice; first time with
+ * trylock set, second time without.
+ */
+ for (;;) {
+ args->pag = xfs_perag_get(mp, args->agno);
+ error = xfs_alloc_fix_freelist(args, flags);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ /*
+ * If we get a buffer back then the allocation will fly.
+ */
+ if (args->agbp) {
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ }
+
+ trace_xfs_alloc_vextent_loopfailed(args);
+
+ /*
+ * Didn't work, figure out the next iteration.
+ */
+ if (args->agno == sagno &&
+ type == XFS_ALLOCTYPE_START_BNO)
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ /*
+ * For the first allocation, we can try any AG to get
+ * space. However, if we already have allocated a
+ * block, we don't want to try AGs whose number is below
+ * sagno. Otherwise, we may end up with out-of-order
+ * locking of AGF, which might cause deadlock.
+ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+ if (args->tp->t_firstblock != NULLFSBLOCK)
+ args->agno = sagno;
+ else
+ args->agno = 0;
+ }
+ /*
+ * Reached the starting a.g., must either be done
+ * or switch to non-trylock mode.
+ */
+ if (args->agno == sagno) {
+ if (flags == 0) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_vextent_allfailed(args);
+ break;
+ }
+
+ flags = 0;
+ if (type == XFS_ALLOCTYPE_START_BNO) {
+ args->agbno = XFS_FSB_TO_AGBNO(mp,
+ args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ }
+ xfs_perag_put(args->pag);
+ }
+ if (bump_rotor) {
+ if (args->agno == sagno)
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ else
+ mp->m_agfrotor = (args->agno * rotorstep + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ }
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+ if (args->agbno == NULLAGBLOCK)
+ args->fsbno = NULLFSBLOCK;
+ else {
+ args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(args->agbno % args->alignment == 0);
+ XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+ args->len);
+#endif
+
+ }
+ xfs_perag_put(args->pag);
+ return 0;
+error0:
+ xfs_perag_put(args->pag);
+ return error;
+}
+
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf **agbp)
+{
+ struct xfs_alloc_arg args;
+ int error;
+
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+ args.agno = pag->pag_agno;
+ args.pag = pag;
+
+ /*
+ * validate that the block number is legal - the enables us to detect
+ * and handle a silent filesystem corruption rather than crashing.
+ */
+ if (args.agno >= args.mp->m_sb.sb_agcount)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ if (error)
+ return error;
+
+ *agbp = args.agbp;
+ return 0;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int
+__xfs_free_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ struct xfs_agf *agf;
+ int error;
+ unsigned int busy_flags = 0;
+ struct xfs_perag *pag;
+
+ ASSERT(len != 0);
+ ASSERT(type != XFS_AG_RESV_AGFL);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_FREE_EXTENT))
+ return -EIO;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error)
+ goto err;
+ agf = agbp->b_addr;
+
+ if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ error = -EFSCORRUPTED;
+ goto err_release;
+ }
+
+ /* validate the extent size is legal now we have the agf locked */
+ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto err_release;
+ }
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+ if (error)
+ goto err_release;
+
+ if (skip_discard)
+ busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
+ xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_perag_put(pag);
+ return 0;
+
+err_release:
+ xfs_trans_brelse(tp, agbp);
+err:
+ xfs_perag_put(pag);
+ return error;
+}
+
+struct xfs_alloc_query_range_info {
+ xfs_alloc_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info *query = priv;
+ struct xfs_alloc_rec_incore irec;
+
+ irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ low_brec.a = *low_rec;
+ high_brec.a = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_alloc_has_record(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.a.ar_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.a.ar_startblock = bno + len - 1;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+/*
+ * Walk all the blocks in the AGFL. The @walk_fn can return any negative
+ * error code or XFS_ITER_*.
+ */
+int
+xfs_agfl_walk(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf,
+ struct xfs_buf *agflbp,
+ xfs_agfl_walk_fn walk_fn,
+ void *priv)
+{
+ __be32 *agfl_bno;
+ unsigned int i;
+ int error;
+
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ i = be32_to_cpu(agf->agf_flfirst);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* Otherwise, walk from first to last, wrapping as needed. */
+ for (;;) {
+ error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (i == be32_to_cpu(agf->agf_fllast))
+ break;
+ if (++i == xfs_agfl_size(mp))
+ i = 0;
+ }
+
+ return 0;
+}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644
index 000000000..2c3f762df
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+extern struct workqueue_struct *xfs_alloc_wq;
+
+unsigned int xfs_agfl_size(struct xfs_mount *mp);
+
+/*
+ * Freespace allocation types. Argument to xfs_alloc_[v]extent.
+ */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
+ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
+ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
+ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
+ { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
+#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
+#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */
+#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_mount *mp; /* file system mount point */
+ struct xfs_buf *agbp; /* buffer for a.g. freelist header */
+ struct xfs_perag *pag; /* per-ag struct for this agno */
+ xfs_fsblock_t fsbno; /* file system block number */
+ xfs_agnumber_t agno; /* allocation group number */
+ xfs_agblock_t agbno; /* allocation group-relative block # */
+ xfs_extlen_t minlen; /* minimum size of extent */
+ xfs_extlen_t maxlen; /* maximum size of extent */
+ xfs_extlen_t mod; /* mod value for extent size */
+ xfs_extlen_t prod; /* prod value for extent size */
+ xfs_extlen_t minleft; /* min blocks must be left after us */
+ xfs_extlen_t total; /* total blocks needed in xaction */
+ xfs_extlen_t alignment; /* align answer to multiple of this */
+ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
+ xfs_agblock_t max_agbno; /* ... */
+ xfs_extlen_t len; /* output: actual size of extent */
+ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
+ xfs_alloctype_t otype; /* original allocation type */
+ int datatype; /* mask defining data type treatment */
+ char wasdel; /* set if allocation was prev delayed */
+ char wasfromfl; /* set if allocation is from freelist */
+ struct xfs_owner_info oinfo; /* owner of blocks being allocated */
+ enum xfs_ag_resv_type resv; /* block reservation to use */
+#ifdef DEBUG
+ bool alloc_minlen_only; /* allocate exact minlen extent */
+#endif
+} xfs_alloc_arg_t;
+
+/*
+ * Defines for datatype
+ */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
+
+/* freespace limit calculations */
+unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
+unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
+
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
+ xfs_extlen_t need, xfs_extlen_t reserved);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
+ struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+ xfs_agblock_t bno, int btreeblk);
+
+/*
+ * Compute and fill in value of m_alloc_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* buffer for a.g. freelist header */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int /* error */
+xfs_alloc_vextent(
+ xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int /* error */
+__xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ const struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard);
+
+static inline int
+xfs_free_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+}
+
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat); /* output: success/failure */
+
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **bpp);
+int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
+ struct xfs_buf *, struct xfs_owner_info *);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_buf **agbp);
+
+xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+
+typedef int (*xfs_alloc_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+ void *priv);
+
+int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exist);
+
+typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
+ void *priv);
+int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
+ struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+ struct xfs_buf *bp)
+{
+ if (xfs_has_crc(bp->b_mount))
+ return bp->b_addr + sizeof(struct xfs_agfl);
+ return bp->b_addr;
+}
+
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
+#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000..549a3cba0
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_allocbt_cur_cache;
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ int btnum = cur->bc_btnum;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ int error;
+ xfs_agblock_t bno;
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, &bno, 1);
+ if (error)
+ return error;
+
+ if (bno == NULLAGBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+
+ atomic64_inc(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
+
+ new->s = cpu_to_be32(bno);
+
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+ bno, 1);
+ if (error)
+ return error;
+
+ atomic64_dec(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+ return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+ struct xfs_perag *pag;
+ __be32 len;
+ int numrecs;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+ switch (reason) {
+ case LASTREC_UPDATE:
+ /*
+ * If this is the last leaf block and it's the last record,
+ * then update the size of the longest extent in the AG.
+ */
+ if (ptr != xfs_btree_get_numrecs(block))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_INSREC:
+ if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+ be32_to_cpu(agf->agf_longest))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_DELREC:
+ numrecs = xfs_btree_get_numrecs(block);
+ if (ptr <= numrecs)
+ return;
+ ASSERT(ptr == numrecs + 1);
+
+ if (numrecs) {
+ xfs_alloc_rec_t *rrp;
+
+ rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+ len = rrp->ar_blockcount;
+ } else {
+ len = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ agf->agf_longest = len;
+ pag = cur->bc_ag.agbp->b_pag;
+ pag->pagf_longest = be32_to_cpu(len);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->alloc.ar_startblock = rec->alloc.ar_startblock;
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_bnobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->alloc.ar_startblock);
+ x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+ key->alloc.ar_startblock = cpu_to_be32(x);
+ key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+ key->alloc.ar_startblock = 0;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+ rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC int64_t
+xfs_bnobt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC int64_t
+xfs_cntbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+ int64_t diff;
+
+ diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ if (diff)
+ return diff;
+
+ return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC int64_t
+xfs_bnobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC int64_t
+xfs_cntbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ int64_t diff;
+
+ diff = be32_to_cpu(k1->alloc.ar_blockcount) -
+ be32_to_cpu(k2->alloc.ar_blockcount);
+ if (diff)
+ return diff;
+
+ return be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+static xfs_failaddr_t
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
+ *
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
+ */
+ level = be16_to_cpu(block->bb_level);
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
+ return __this_address;
+ } else if (level >= mp->m_alloc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+}
+
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_allocbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_allocbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
+
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
+
+STATIC int
+xfs_bnobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_bnobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_cntbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
+}
+
+STATIC int
+xfs_cntbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
+}
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_bnobt_key_diff,
+ .buf_ops = &xfs_bnobt_buf_ops,
+ .diff_two_keys = xfs_bnobt_diff_two_keys,
+ .keys_inorder = xfs_bnobt_keys_inorder,
+ .recs_inorder = xfs_bnobt_recs_inorder,
+};
+
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_cntbt_key_diff,
+ .buf_ops = &xfs_cntbt_buf_ops,
+ .diff_two_keys = xfs_cntbt_diff_two_keys,
+ .keys_inorder = xfs_cntbt_keys_inorder,
+ .recs_inorder = xfs_cntbt_recs_inorder,
+};
+
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
+ cur->bc_ag.abt.active = false;
+
+ if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_ops = &xfs_cntbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+ } else {
+ cur->bc_ops = &xfs_bnobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ }
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ return cur;
+}
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum) /* btree identifier */
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_CNT)
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ else
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+
+ cur->bc_ag.agbp = agbp;
+
+ return cur;
+}
+
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new free space btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ } else {
+ cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ }
+}
+
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
+}
+
+/* Calculate the freespace btree size for some records. */
+xfs_extlen_t
+xfs_allocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_alloc_mnr, len);
+}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644
index 000000000..45df893ef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+ ((xfs_alloc_rec_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+ ((xfs_alloc_key_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_alloc_ptr_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_alloc_key_t) + \
+ ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
+#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000..e28d93d23
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1600 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+
+struct kmem_cache *xfs_attr_intent_cache;
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
+static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_lookup(struct xfs_da_args *args,
+ struct xfs_da_state *state);
+
+int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_has_attr_fork(ip))
+ return 0;
+ if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * Returns true if the there is exactly only block in the attr fork, in which
+ * case the attribute fork consists of a single leaf block entry.
+ */
+bool
+xfs_attr_is_leaf(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap;
+
+ if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ return false;
+
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_get_extent(ifp, &icur, &imap);
+ return imap.br_startoff == 0 && imap.br_blockcount == 1;
+}
+
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+/*
+ * Retrieve an extended attribute and its value. Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+int
+xfs_attr_get_ilocked(
+ struct xfs_da_args *args)
+{
+ ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ if (!xfs_inode_hasattr(args->dp))
+ return -ENOATTR;
+
+ if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_shortform_getvalue(args);
+ if (xfs_attr_is_leaf(args->dp))
+ return xfs_attr_leaf_get(args);
+ return xfs_attr_node_get(args);
+}
+
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args.valuelen.
+ *
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
+ */
+int
+xfs_attr_get(
+ struct xfs_da_args *args)
+{
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
+
+ if (xfs_is_shutdown(args->dp->i_mount))
+ return -EIO;
+
+ args->geo = args->dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
+ /* Entirely possible to look up a name which doesn't exist */
+ args->op_flags = XFS_DA_OP_OKNOENT;
+
+ lock_mode = xfs_ilock_attr_map_shared(args->dp);
+ error = xfs_attr_get_ilocked(args);
+ xfs_iunlock(args->dp, lock_mode);
+
+ return error;
+}
+
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+int
+xfs_attr_calc_size(
+ struct xfs_da_args *args,
+ int *local)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ int size;
+ int nblks;
+
+ /*
+ * Determine space new attribute will use, and if it would be
+ * "local" or "remote" (note: local != inline).
+ */
+ size = xfs_attr_leaf_newentsize(args, local);
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ if (*local) {
+ if (size > (args->geo->blksize / 2)) {
+ /* Double split possible */
+ nblks *= 2;
+ }
+ } else {
+ /*
+ * Out of line attribute, cannot double split, but
+ * make room for the attribute value itself.
+ */
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ nblks += dblocks;
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ }
+
+ return nblks;
+}
+
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC.
+ * to use.
+ */
+STATIC int
+xfs_attr_try_sf_addname(
+ struct xfs_inode *dp,
+ struct xfs_da_args *args)
+{
+
+ int error;
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
+
+ error = xfs_attr_shortform_addname(args);
+ if (error == -ENOSPC)
+ return error;
+
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ */
+ if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ if (xfs_has_wsync(dp->i_mount))
+ xfs_trans_set_sync(args->trans);
+
+ return error;
+}
+
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in a attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
+ */
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_intent *attr,
+ enum xfs_delattr_state replace_state)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion.
+ */
+ error = xfs_attr_leaf_try_add(args);
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
+
+ /*
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
+ */
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
+ */
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
+
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
+
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
+ /*
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
+ */
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
+
+ /*
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
+ */
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
+
+ /*
+ * Mark the attribute as INCOMPLETE
+ */
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/* Ensure the da state of an xattr deferred work item is ready to go. */
+static inline void
+xfs_attr_item_init_da_state(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+
+ if (!attr->xattri_da_state)
+ attr->xattri_da_state = xfs_da_state_alloc(args);
+ else
+ xfs_da_state_reset(attr->xattri_da_state, args);
+}
+
+/*
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
+ */
+static
+int xfs_attr_node_removename_setup(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state;
+ int error;
+
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
+
+ state = attr->xattri_da_state;
+ ASSERT(state->path.blk[state->path.active - 1].bp != NULL);
+ ASSERT(state->path.blk[state->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr_leaf_mark_incomplete(args, state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return error;
+}
+
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
+
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
+
+ return error;
+}
+
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
+ }
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
+ fallthrough;
+
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
+ /*
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
+ */
+ error = xfs_attr3_leaf_flipflags(args);
+ if (error)
+ return error;
+ /*
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
+ */
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
+ /*
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
+ */
+ xfs_attr_restore_rmt_blk(args);
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
+
+ attr->xattri_dela_state++;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
+ */
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+static int
+xfs_attr_lookup(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_da_state *state;
+ int error;
+
+ if (!xfs_inode_hasattr(dp))
+ return -ENOATTR;
+
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_sf_findname(args, NULL, NULL);
+
+ if (xfs_attr_is_leaf(dp)) {
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (bp)
+ xfs_trans_brelse(args->trans, bp);
+
+ return error;
+ }
+
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ xfs_da_state_free(state);
+ return error;
+}
+
+static int
+xfs_attr_intent_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+{
+
+ struct xfs_attr_intent *new;
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_intent *new;
+ int error;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/*
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setattr API.
+ */
+int
+xfs_attr_set(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans_res tres;
+ bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
+ int error, local;
+ int rmt_blks = 0;
+ unsigned int total;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ error = xfs_qm_dqattach(dp);
+ if (error)
+ return error;
+
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
+ /*
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
+ */
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
+
+ if (args->value) {
+ XFS_STATS_INC(mp, xs_attr_set);
+ args->total = xfs_attr_calc_size(args, &local);
+
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (xfs_inode_has_attr_fork(dp) == 0) {
+ int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(args->namelen,
+ args->valuelen);
+
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
+ }
+
+ if (!local)
+ rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ } else {
+ XFS_STATS_INC(mp, xs_attr_remove);
+ rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+ }
+
+ /*
+ * Root fork attributes can use reserved data blocks for this
+ * operation if necessary
+ */
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
+ if (error)
+ return error;
+
+ if (args->value || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ error = xfs_attr_lookup(args);
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
+ goto out_trans_cancel;
+
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
+ }
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(args->trans);
+
+ if (!(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
+ * Commit the last in the sequence of transactions.
+ */
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args->trans);
+out_unlock:
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+
+out_trans_cancel:
+ if (args->trans)
+ xfs_trans_cancel(args->trans);
+ goto out_unlock;
+}
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
+{
+ int newsize, forkoff;
+ int error;
+
+ trace_xfs_attr_sf_addname(args);
+
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
+ /*
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+ * so that the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
+ */
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
+ }
+
+ if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+ args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return -ENOSPC;
+
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
+
+ forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+ if (!forkoff)
+ return -ENOSPC;
+
+ xfs_attr_shortform_add(args, forkoff);
+ return 0;
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/* Save the current remote block info and clear the current pointers. */
+static void
+xfs_attr_save_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno2 = args->blkno;
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
+}
+
+/* Set stored info about a remote block */
+static void
+xfs_attr_restore_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno = args->blkno2;
+ args->index = args->index2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
+}
+
+/*
+ * Tries to add an attribute to an inode in leaf form
+ *
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller. On success the attribute is added
+ * and the inode and transaction are left dirty. If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
+ */
+STATIC int
+xfs_attr_leaf_try_add(
+ struct xfs_da_args *args)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Look up the xattr name to set the insertion point for the new xattr.
+ */
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto out_brelse;
+
+ trace_xfs_attr_leaf_replace(args);
+ /*
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
+ }
+
+ return xfs_attr3_leaf_add(bp, args);
+
+out_brelse:
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+ struct xfs_da_args *args,
+ struct xfs_buf **bp)
+{
+ int error = 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+ if (error)
+ return error;
+
+ error = xfs_attr3_leaf_lookup_int(*bp, args);
+ if (error != -ENOATTR && error != -EEXIST)
+ xfs_trans_brelse(args->trans, *bp);
+
+ return error;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp;
+ struct xfs_buf *bp;
+ int error, forkoff;
+
+ trace_xfs_attr_leaf_removename(args);
+
+ /*
+ * Remove the attribute.
+ */
+ dp = args->dp;
+
+ error = xfs_attr_leaf_hasname(args, &bp);
+ if (error == -ENOATTR) {
+ xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return error;
+ } else if (error != -EEXIST)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ /*
+ * If the result is small enough, shrink it all into the inode.
+ */
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return 0;
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_get(args);
+
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (error == -ENOATTR) {
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+ } else if (error != -EEXIST)
+ return error;
+
+
+ error = xfs_attr3_leaf_getvalue(bp, args);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/* Return EEXIST if attr is found, or ENOATTR if not. */
+STATIC int
+xfs_attr_node_lookup(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int retval, error;
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ return error;
+
+ return retval;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+STATIC int
+xfs_attr_node_addname_find_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ /*
+ * Search to see if name already exists, and get back a pointer
+ * to where it should go.
+ */
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto error;
+
+
+ trace_xfs_attr_node_replace(args);
+ /*
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
+ }
+
+ return 0;
+error:
+ if (attr->xattri_da_state) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+ return error;
+}
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_state *state = attr->xattri_da_state;
+ struct xfs_da_state_blk *blk;
+ int error;
+
+ trace_xfs_attr_node_addname(state->args);
+
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr3_leaf_add(blk->bp, state->args);
+ if (error == -ENOSPC) {
+ if (state->path.active == 1) {
+ /*
+ * Its really a single leaf node, but it had
+ * out-of-line values so it looked like it *might*
+ * have been a b-tree. Let the caller deal with this.
+ */
+ goto out;
+ }
+
+ /*
+ * Split as many Btree elements as required.
+ * This code tracks the new and old attr's location
+ * in the index/blkno/rmtblkno/rmtblkcnt fields and
+ * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+ */
+ error = xfs_da3_split(state);
+ if (error)
+ goto out;
+ } else {
+ /*
+ * Addition succeeded, update Btree hashvals.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ }
+
+out:
+ xfs_da_state_free(state);
+ attr->xattri_da_state = NULL;
+ return error;
+}
+
+static int
+xfs_attr_node_removename(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *blk;
+ int retval;
+
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ return retval;
+}
+
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = xfs_da_state_alloc(args);
+ int retval = 0;
+ int error = 0;
+
+ /*
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
+ */
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
+
+ error = xfs_attr_node_removename(args, state);
+
+ /*
+ * Check to see if the tree needs to be collapsed.
+ */
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
+ if (error)
+ goto out;
+ }
+ retval = error = 0;
+
+out:
+ xfs_da_state_free(state);
+ if (error)
+ return error;
+ return retval;
+}
+
+/*
+ * Retrieve the attribute data from a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+STATIC int
+xfs_attr_node_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_da_state *state;
+ struct xfs_da_state_blk *blk;
+ int i;
+ int error;
+
+ trace_xfs_attr_node_get(args);
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ if (error != -EEXIST)
+ goto out_release;
+
+ /*
+ * Get the value, local or "remote"
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ error = xfs_attr3_leaf_getvalue(blk->bp, args);
+
+ /*
+ * If not in a transaction, we have to release all the buffers.
+ */
+out_release:
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+
+ xfs_da_state_free(state);
+ return error;
+}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
+
+int __init
+xfs_attr_intent_init_cache(void)
+{
+ xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent",
+ sizeof(struct xfs_attr_intent),
+ 0, 0, NULL);
+
+ return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attr_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attr_intent_cache);
+ xfs_attr_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
new file mode 100644
index 000000000..81be9b3e4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -0,0 +1,621 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_H__
+#define __XFS_ATTR_H__
+
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
+/*
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call. Larger
+ * sizes will result in an ERANGE return code.
+ */
+#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+struct xfs_attrlist_cursor_kern {
+ __u32 hashval; /* hash value of next entry to add */
+ __u32 blkno; /* block containing entry (suggestion) */
+ __u32 offset; /* offset in list of equal-hashvals */
+ __u16 pad1; /* padding to match user-level */
+ __u8 pad2; /* padding to match user-level */
+ __u8 initted; /* T/F: cursor has been initialized */
+};
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+ unsigned char *, int, int);
+
+struct xfs_attr_list_context {
+ struct xfs_trans *tp;
+ struct xfs_inode *dp; /* inode */
+ struct xfs_attrlist_cursor_kern cursor; /* position in list */
+ void *buffer; /* output buffer */
+
+ /*
+ * Abort attribute list iteration if non-zero. Can be used to pass
+ * error values to the xfs_attr_list caller.
+ */
+ int seen_enough;
+ bool allow_incomplete;
+
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
+ int resynch; /* T/F: resynch with cursor */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+};
+
+
+/*
+ * ========================================================================
+ * Structure used to pass context around among the delayed routines.
+ * ========================================================================
+ */
+
+/*
+ * Below is a state machine diagram for attr remove operations. The XFS_DAS_*
+ * states indicate places where the function would return -EAGAIN, and then
+ * immediately resume from after being called by the calling function. States
+ * marked as a "subroutine state" indicate that they belong to a subroutine, and
+ * so the calling function needs to pass them back to that subroutine to allow
+ * it to finish where it left off. But they otherwise do not have a role in the
+ * calling function other than just passing through.
+ *
+ * xfs_attr_remove_iter()
+ * │
+ * v
+ * have attr to remove? ──n──> done
+ * │
+ * y
+ * │
+ * v
+ * are we short form? ──y──> xfs_attr_shortform_remove ──> done
+ * │
+ * n
+ * │
+ * V
+ * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done
+ * │
+ * n
+ * │
+ * V
+ * ┌── need to setup state?
+ * │ │
+ * n y
+ * │ │
+ * │ v
+ * │ find attr and get state
+ * │ attr has remote blks? ──n─┐
+ * │ │ v
+ * │ │ find and invalidate
+ * │ y the remote blocks.
+ * │ │ mark attr incomplete
+ * │ ├────────────────┘
+ * └──────────┤
+ * │
+ * v
+ * Have remote blks to remove? ───y─────┐
+ * │ ^ remove the blks
+ * │ │ │
+ * │ │ v
+ * │ XFS_DAS_RMTBLK <─n── done?
+ * │ re-enter with │
+ * │ one less blk to y
+ * │ remove │
+ * │ V
+ * │ refill the state
+ * n │
+ * │ v
+ * │ XFS_DAS_RM_NAME
+ * │ │
+ * ├─────────────────────────┘
+ * │
+ * v
+ * remove leaf and
+ * update hash with
+ * xfs_attr_node_remove_cleanup
+ * │
+ * v
+ * need to
+ * shrink tree? ─n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * join leaf │
+ * │ │
+ * v │
+ * XFS_DAS_RM_SHRINK │
+ * │ │
+ * v │
+ * do the shrink │
+ * │ │
+ * v │
+ * free state <──┘
+ * │
+ * v
+ * done
+ *
+ *
+ * Below is a state machine diagram for attr set operations.
+ *
+ * It seems the challenge with understanding this system comes from trying to
+ * absorb the state machine all at once, when really one should only be looking
+ * at it with in the context of a single function. Once a state sensitive
+ * function is called, the idea is that it "takes ownership" of the
+ * state machine. It isn't concerned with the states that may have belonged to
+ * it's calling parent. Only the states relevant to itself or any other
+ * subroutines there in. Once a calling function hands off the state machine to
+ * a subroutine, it needs to respect the simple rule that it doesn't "own" the
+ * state machine anymore, and it's the responsibility of that calling function
+ * to propagate the -EAGAIN back up the call stack. Upon reentry, it is
+ * committed to re-calling that subroutine until it returns something other than
+ * -EAGAIN. Once that subroutine signals completion (by returning anything other
+ * than -EAGAIN), the calling function can resume using the state machine.
+ *
+ * xfs_attr_set_iter()
+ * │
+ * v
+ * ┌─y─ has an attr fork?
+ * │ |
+ * │ n
+ * │ |
+ * │ V
+ * │ add a fork
+ * │ │
+ * └──────────┤
+ * │
+ * V
+ * ┌─── is shortform?
+ * │ │
+ * │ y
+ * │ │
+ * │ V
+ * │ xfs_attr_set_fmt
+ * │ |
+ * │ V
+ * │ xfs_attr_try_sf_addname
+ * │ │
+ * │ V
+ * │ had enough ──y──> done
+ * │ space?
+ * n │
+ * │ n
+ * │ │
+ * │ V
+ * │ transform to leaf
+ * │ │
+ * │ V
+ * │ hold the leaf buffer
+ * │ │
+ * │ V
+ * │ return -EAGAIN
+ * │ Re-enter in
+ * │ leaf form
+ * │
+ * └─> release leaf buffer
+ * if needed
+ * │
+ * V
+ * ┌───n── fork has
+ * │ only 1 blk?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ xfs_attr_leaf_try_add()
+ * │ │
+ * │ v
+ * │ had enough ──────────────y─────────────┐
+ * │ space? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ return -EAGAIN │
+ * │ re-enter in │
+ * │ node form │
+ * │ │ │
+ * └──────────┤ │
+ * │ │
+ * V │
+ * xfs_attr_node_addname_find_attr │
+ * determines if this │
+ * is create or rename │
+ * find space to store attr │
+ * │ │
+ * v │
+ * xfs_attr_node_addname │
+ * │ │
+ * v │
+ * fits in a node leaf? ────n─────┐ │
+ * │ ^ v │
+ * │ │ single leaf node? │
+ * │ │ │ │ │
+ * y │ y n │
+ * │ │ │ │ │
+ * v │ v v │
+ * update │ grow the leaf split if │
+ * hashvals └── return -EAGAIN needed │
+ * │ retry leaf add │ │
+ * │ on reentry │ │
+ * ├────────────────────────────┘ │
+ * │ │
+ * v │
+ * need to alloc │
+ * ┌─y── or flip flag? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ done │
+ * │ │
+ * │ │
+ * │ XFS_DAS_FOUND_LBLK <────────────────┘
+ * │ │
+ * │ V
+ * │ xfs_attr_leaf_addname()
+ * │ │
+ * │ v
+ * │ ┌──first time through?
+ * │ │ │
+ * │ │ y
+ * │ │ │
+ * │ n v
+ * │ │ if we have rmt blks
+ * │ │ find space for them
+ * │ │ │
+ * │ └──────────┤
+ * │ │
+ * │ v
+ * │ still have
+ * │ ┌─n─ blks to alloc? <──┐
+ * │ │ │ │
+ * │ │ y │
+ * │ │ │ │
+ * │ │ v │
+ * │ │ alloc one blk │
+ * │ │ return -EAGAIN ──┘
+ * │ │ re-enter with one
+ * │ │ less blk to alloc
+ * │ │
+ * │ │
+ * │ └───> set the rmt
+ * │ value
+ * │ │
+ * │ v
+ * │ was this
+ * │ a rename? ──n─┐
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ v │
+ * │ flip incomplete │
+ * │ flag │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_FLIP_LFLAG │
+ * │ │ │
+ * │ v │
+ * │ need to remove │
+ * │ old bks? ──n──┤
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ V │
+ * │ remove │
+ * │ ┌───> old blks │
+ * │ │ │ │
+ * │ XFS_DAS_RM_LBLK │ │
+ * │ ^ │ │
+ * │ │ v │
+ * │ └──y── more to │
+ * │ remove? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_RD_LEAF │
+ * │ │ │
+ * │ v │
+ * │ remove leaf │
+ * │ │ │
+ * │ v │
+ * │ shrink to sf │
+ * │ if needed │
+ * │ │ │
+ * │ v │
+ * │ done <──────┘
+ * │
+ * └──────> XFS_DAS_FOUND_NBLK
+ * │
+ * v
+ * ┌─────n── need to
+ * │ alloc blks?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ find space
+ * │ │
+ * │ v
+ * │ ┌─>XFS_DAS_ALLOC_NODE
+ * │ │ │
+ * │ │ v
+ * │ │ alloc blk
+ * │ │ │
+ * │ │ v
+ * │ └──y── need to alloc
+ * │ more blocks?
+ * │ │
+ * │ n
+ * │ │
+ * │ v
+ * │ set the rmt value
+ * │ │
+ * │ v
+ * │ was this
+ * └────────> a rename? ──n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * flip incomplete │
+ * flag │
+ * │ │
+ * v │
+ * XFS_DAS_FLIP_NFLAG │
+ * │ │
+ * v │
+ * need to │
+ * remove blks? ─n──┤
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * remove │
+ * ┌────────> old blks │
+ * │ │ │
+ * XFS_DAS_RM_NBLK │ │
+ * ^ │ │
+ * │ v │
+ * └──────y── more to │
+ * remove │
+ * │ │
+ * n │
+ * │ │
+ * v │
+ * XFS_DAS_CLR_FLAG │
+ * │ │
+ * v │
+ * clear flags │
+ * │ │
+ * ├──────────┘
+ * │
+ * v
+ * done
+ */
+
+/*
+ * Enum values for xfs_attr_intent.xattri_da_state
+ *
+ * These values are used by delayed attribute operations to keep track of where
+ * they were before they returned -EAGAIN. A return code of -EAGAIN signals the
+ * calling function to roll the transaction, and then call the subroutine to
+ * finish the operation. The enum is then used by the subroutine to jump back
+ * to where it was and resume executing where it left off.
+ */
+enum xfs_delattr_state {
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
+};
+
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
+struct xfs_attri_log_nameval;
+
+/*
+ * Context used for keeping track of delayed attribute operations
+ */
+struct xfs_attr_intent {
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
+
+ /* Used in xfs_attr_node_removename to roll through removing blocks */
+ struct xfs_da_state *xattri_da_state;
+
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Shared buffer containing the attr name and value so that the logging
+ * code can share large memory buffers between log items.
+ */
+ struct xfs_attri_log_nameval *xattri_nameval;
+
+ /* Used to keep track of current state of delayed operation */
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
+ struct xfs_bmbt_irec xattri_map;
+};
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+bool xfs_attr_is_leaf(struct xfs_inode *ip);
+int xfs_attr_get_ilocked(struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_da_args *args);
+int xfs_attr_set(struct xfs_da_args *args);
+int xfs_attr_set_iter(struct xfs_attr_intent *attr);
+int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
+bool xfs_attr_namecheck(const void *name, size_t length);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of a attr remove to determine the
+ * next state, the attribute fork may be null. This can occur only occur
+ * on a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_af is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!xfs_inode_has_attr_fork(args->dp))
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (args->op_flags & XFS_DA_OP_LOGGED)
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
+
+extern struct kmem_cache *xfs_attr_intent_cache;
+int __init xfs_attr_intent_init_cache(void);
+void xfs_attr_intent_destroy_cache(void);
+
+#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000..beee51ad7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,3002 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_errortag.h"
+
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+ xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
+ struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+ struct xfs_attr_leafblock *dst_leaf,
+ struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+ int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+/*
+ * attr3 block 'firstused' conversion helpers.
+ *
+ * firstused refers to the offset of the first used byte of the nameval region
+ * of an attr leaf block. The region starts at the tail of the block and expands
+ * backwards towards the middle. As such, firstused is initialized to the block
+ * size for an empty leaf block and is reduced from there.
+ *
+ * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
+ * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
+ * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
+ * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
+ * the attr block size. The following helpers manage the conversion between the
+ * in-core and on-disk formats.
+ */
+
+static void
+xfs_attr3_leaf_firstused_from_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) from;
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ } else {
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ }
+
+ /*
+ * Convert from the magic fsb size value to actual blocksize. This
+ * should only occur for empty blocks when the block size overflows
+ * 16-bits.
+ */
+ if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
+ ASSERT(!to->count && !to->usedbytes);
+ ASSERT(geo->blksize > USHRT_MAX);
+ to->firstused = geo->blksize;
+ }
+}
+
+static void
+xfs_attr3_leaf_firstused_to_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+ uint32_t firstused;
+
+ /* magic value should only be seen on disk */
+ ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+ /*
+ * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
+ * value. This only overflows at the max supported value of 64k. Use the
+ * magic on-disk value to represent block size in this case.
+ */
+ firstused = from->firstused;
+ if (firstused > USHRT_MAX) {
+ ASSERT(from->firstused == geo->blksize);
+ firstused = XFS_ATTR3_LEAF_NULLOFF;
+ }
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) to;
+ hdr3->firstused = cpu_to_be16(firstused);
+ } else {
+ to->hdr.firstused = cpu_to_be16(firstused);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ int i;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+ to->holes = hdr3->holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+ }
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+ to->holes = from->hdr.holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ int i;
+
+ ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+ from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+ hdr3->holes = from->holes;
+ hdr3->pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+ to->hdr.holes = from->holes;
+ to->hdr.pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+}
+
+static xfs_failaddr_t
+xfs_attr3_leaf_verify_entry(
+ struct xfs_mount *mp,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ __u32 *last_hashval)
+{
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ char *name_end;
+ unsigned int nameidx;
+ unsigned int namesize;
+ __u32 hashval;
+
+ /* hash order check */
+ hashval = be32_to_cpu(ent->hashval);
+ if (hashval < *last_hashval)
+ return __this_address;
+ *last_hashval = hashval;
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize)
+ return __this_address;
+
+ /*
+ * Check the name information. The namelen fields are u8 so we can't
+ * possibly exceed the maximum name length of 255 bytes.
+ */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ return __this_address;
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0)
+ return __this_address;
+ if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
+ rentry->valueblk == 0)
+ return __this_address;
+ }
+
+ if (name_end > buf_end)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
+static xfs_failaddr_t
+xfs_attr3_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *ent;
+ char *buf_end;
+ uint32_t end; /* must be 32bit - see below */
+ __u32 last_hashval = 0;
+ int i;
+ xfs_failaddr_t fa;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ /*
+ * firstused is the block offset of the first name info structure.
+ * Make sure it doesn't go off the block or crash into the header.
+ */
+ if (ichdr.firstused > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf))
+ return __this_address;
+
+ /* Make sure the entries array doesn't crash into the name info. */
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
+ if ((char *)&entries[ichdr.count] >
+ (char *)bp->b_addr + ichdr.firstused)
+ return __this_address;
+
+ /*
+ * NOTE: This verifier historically failed empty leaf buffers because
+ * we expect the fork to be in another format. Empty attr fork format
+ * conversions are possible during xattr set, however, and format
+ * conversion is not atomic with the xattr set that triggers it. We
+ * cannot assume leaf blocks are non-empty until that is addressed.
+ */
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
+ fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
+ ent, i, &last_hashval);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * Quickly check the freemap information. Attribute data has to be
+ * aligned to 4-byte boundaries, and likewise for the free space.
+ *
+ * Note that for 64k block size filesystems, the freemap entries cannot
+ * overflow as they are only be16 fields. However, when checking end
+ * pointer of the freemap, we have to be careful to detect overflows and
+ * so use uint32_t for those checks.
+ */
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.freemap[i].base & 0x3)
+ return __this_address;
+ if (ichdr.freemap[i].size > mp->m_attr_geo->blksize)
+ return __this_address;
+ if (ichdr.freemap[i].size & 0x3)
+ return __this_address;
+
+ /* be care of 16 bit overflows here */
+ end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
+ if (end < ichdr.freemap[i].base)
+ return __this_address;
+ if (end > mp->m_attr_geo->blksize)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_attr3_leaf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_attr3_leaf_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
+ .verify_read = xfs_attr3_leaf_read_verify,
+ .verify_write = xfs_attr3_leaf_write_verify,
+ .verify_struct = xfs_attr3_leaf_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
+ &xfs_attr3_leaf_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in attr->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
+static bool
+xfs_attr_match(
+ struct xfs_da_args *args,
+ uint8_t namelen,
+ unsigned char *name,
+ int flags)
+{
+
+ if (args->namelen != namelen)
+ return false;
+ if (memcmp(args->name, name, namelen) != 0)
+ return false;
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
+ if (args->attr_filter !=
+ (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
+ return false;
+ return true;
+}
+
+static int
+xfs_attr_copy_value(
+ struct xfs_da_args *args,
+ unsigned char *value,
+ int valuelen)
+{
+ /*
+ * No copy if all we have to do is get the length
+ */
+ if (!args->valuelen) {
+ args->valuelen = valuelen;
+ return 0;
+ }
+
+ /*
+ * No copy if the length of the existing buffer is too small
+ */
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return -ERANGE;
+ }
+
+ if (!args->value) {
+ args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
+ if (!args->value)
+ return -ENOMEM;
+ }
+ args->valuelen = valuelen;
+
+ /* remote block xattr requires IO for copy-in */
+ if (args->rmtblkno)
+ return xfs_attr_rmtval_get(args);
+
+ /*
+ * This is to prevent a GCC warning because the remote xattr case
+ * doesn't have a value to pass in. In that case, we never reach here,
+ * but GCC can't work that out and so throws a "passing NULL to
+ * memcpy" warning.
+ */
+ if (!value)
+ return -EINVAL;
+ memcpy(args->value, value, valuelen);
+ return 0;
+}
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the total requested number of attr fork bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the i_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(
+ struct xfs_inode *dp,
+ int bytes)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int64_t dsize;
+ int minforkoff;
+ int maxforkoff;
+ int offset;
+
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
+
+ /* rounded down */
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) {
+ minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ return (offset >= minforkoff) ? minforkoff : 0;
+ }
+
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= xfs_inode_attr_fork_size(dp))
+ return dp->i_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!xfs_has_attr2(mp))
+ return 0;
+
+ dsize = dp->i_df.if_bytes;
+
+ switch (dp->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /*
+ * If there is no attr fork and the data fork is extents,
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
+ * the btree root.
+ */
+ if (!dp->i_forkoff && dp->i_df.if_bytes >
+ xfs_default_attroffset(dp))
+ dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /*
+ * If we have a data btree then keep forkoff if we have one,
+ * otherwise we are adding a new attr, so then we set
+ * minforkoff to where the btree root can finish so we have
+ * plenty of room for attrs
+ */
+ if (dp->i_forkoff) {
+ if (offset < dp->i_forkoff)
+ return 0;
+ return dp->i_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+ break;
+ }
+
+ /*
+ * A data fork btree root must have space for at least
+ * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+ */
+ minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ minforkoff = roundup(minforkoff, 8) >> 3;
+
+ /* attr fork btree root can have at least this many key/ptr pairs */
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = maxforkoff >> 3; /* rounded down */
+
+ if (offset >= maxforkoff)
+ return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
+ return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
+ * - noattr2 mount option is set,
+ * - on-disk version bit says it is already set, or
+ * - the attr2 mount option is not set to enable automatic upgrade from attr1.
+ */
+STATIC void
+xfs_sbversion_add_attr2(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp)
+{
+ if (xfs_has_noattr2(mp))
+ return;
+ if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ return;
+ if (!xfs_has_attr2(mp))
+ return;
+
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr2(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *hdr;
+
+ trace_xfs_attr_sf_create(args);
+
+ ASSERT(ifp->if_bytes == 0);
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+ hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args: args containing attribute name and namelen
+ * sfep: If not null, pointer will be set to the last attr entry found on
+ -EEXIST. On -ENOATTR pointer is left at the last entry in the list
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
+ * the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+ struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int base = sizeof(struct xfs_attr_sf_hdr);
+ int size = 0;
+ int end;
+ int i;
+
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
+ sfe = &sf->list[0];
+ end = sf->hdr.count;
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
+ base += size, i++) {
+ size = xfs_attr_sf_entsize(sfe);
+ if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ continue;
+ break;
+ }
+
+ if (sfep != NULL)
+ *sfep = sfe;
+
+ if (basep != NULL)
+ *basep = base;
+
+ if (i == end)
+ return -ENOATTR;
+ return -EEXIST;
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(
+ struct xfs_da_args *args,
+ int forkoff)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int offset, size;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_add(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ dp->i_forkoff = forkoff;
+
+ ifp = &dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
+ ASSERT(0);
+
+ offset = (char *)sfe - (char *)sf;
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
+
+ sfe->namelen = args->namelen;
+ sfe->valuelen = args->valuelen;
+ sfe->flags = args->attr_filter;
+ memcpy(sfe->nameval, args->name, args->namelen);
+ memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+ sf->hdr.count++;
+ be16_add_cpu(&sf->hdr.totsize, size);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+void
+xfs_attr_fork_remove(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp)
+{
+ ASSERT(ip->i_af.if_nextents == 0);
+
+ xfs_ifork_zap_attr(ip);
+ ip->i_forkoff = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_sf_removename(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int size = 0, end, totsize;
+ unsigned int base;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ int error;
+
+ trace_xfs_attr_sf_remove(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+
+ error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
+ if (error != -EEXIST)
+ return error;
+ size = xfs_attr_sf_entsize(sfe);
+
+ /*
+ * Fix up the attribute fork data, covering the hole
+ */
+ end = base + size;
+ totsize = be16_to_cpu(sf->hdr.totsize);
+ if (end != totsize)
+ memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+ sf->hdr.count--;
+ be16_add_cpu(&sf->hdr.totsize, -size);
+
+ /*
+ * Fix up the start offset of the attribute fork
+ */
+ totsize -= size;
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
+ xfs_attr_fork_remove(dp, args->trans);
+ } else {
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+ ASSERT(dp->i_forkoff);
+ ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ (args->op_flags & XFS_DA_OP_ADDNAME) ||
+ !xfs_has_attr2(mp) ||
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
+ xfs_trans_log_inode(args->trans, dp,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+
+ return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_lookup(args);
+
+ ifp = &args->dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return -EEXIST;
+ }
+ return -ENOATTR;
+}
+
+/*
+ * Retrieve the attribute value and length.
+ *
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
+ */
+int
+xfs_attr_shortform_getvalue(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
+
+ ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return xfs_attr_copy_value(args,
+ &sfe->nameval[args->namelen], sfe->valuelen);
+ }
+ return -ENOATTR;
+}
+
+/* Convert from using the shortform to the leaf format. */
+int
+xfs_attr_shortform_to_leaf(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_da_args nargs;
+ char *tmpbuffer;
+ int error, i, size;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_attr_sf_to_leaf(args);
+
+ dp = args->dp;
+ ifp = &dp->i_af;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ size = be16_to_cpu(sf->hdr.totsize);
+ tmpbuffer = kmem_alloc(size, 0);
+ ASSERT(tmpbuffer != NULL);
+ memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
+
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
+
+ bp = NULL;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ goto out;
+
+ ASSERT(blkno == 0);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
+ if (error)
+ goto out;
+
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.dp = dp;
+ nargs.geo = args->geo;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count; i++) {
+ nargs.name = sfe->nameval;
+ nargs.namelen = sfe->namelen;
+ nargs.value = &sfe->nameval[nargs.namelen];
+ nargs.valuelen = sfe->valuelen;
+ nargs.hashval = xfs_da_hashname(sfe->nameval,
+ sfe->namelen);
+ nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+ ASSERT(error == -ENOATTR);
+ error = xfs_attr3_leaf_add(bp, &nargs);
+ ASSERT(error != -ENOSPC);
+ if (error)
+ goto out;
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+ error = 0;
+out:
+ kmem_free(tmpbuffer);
+ return error;
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
+ struct xfs_mount *mp = bp->b_mount;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ bytes = sizeof(struct xfs_attr_sf_hdr);
+ for (i = 0; i < leafhdr.count; entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!(entry->flags & XFS_ATTR_LOCAL))
+ return 0;
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return 0;
+ if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return 0;
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
+ }
+ if (xfs_has_attr2(dp->i_mount) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ (bytes == sizeof(struct xfs_attr_sf_hdr)))
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
+}
+
+/* Verify the consistency of an inline attribute fork. */
+xfs_failaddr_t
+xfs_attr_shortform_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_shortform *sfp;
+ struct xfs_attr_sf_entry *sfep;
+ struct xfs_attr_sf_entry *next_sfep;
+ char *endp;
+ struct xfs_ifork *ifp;
+ int i;
+ int64_t size;
+
+ ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+ sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ size = ifp->if_bytes;
+
+ /*
+ * Give up if the attribute is way too short.
+ */
+ if (size < sizeof(struct xfs_attr_sf_hdr))
+ return __this_address;
+
+ endp = (char *)sfp + size;
+
+ /* Check all reported entries */
+ sfep = &sfp->list[0];
+ for (i = 0; i < sfp->hdr.count; i++) {
+ /*
+ * struct xfs_attr_sf_entry has a variable length.
+ * Check the fixed-offset parts of the structure are
+ * within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
+ */
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
+ return __this_address;
+
+ /* Don't allow names with known bad length. */
+ if (sfep->namelen == 0)
+ return __this_address;
+
+ /*
+ * Check that the variable-length part of the structure is
+ * within the data buffer. The next entry starts after the
+ * name component, so nextentry is an acceptable test.
+ */
+ next_sfep = xfs_attr_sf_nextentry(sfep);
+ if ((char *)next_sfep > endp)
+ return __this_address;
+
+ /*
+ * Check for unknown flags. Short form doesn't support
+ * the incomplete or local bits, so we can use the namespace
+ * mask here.
+ */
+ if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK)
+ return __this_address;
+
+ /*
+ * Check for invalid namespace combinations. We only allow
+ * one namespace flag per xattr, so we can just count the
+ * bits (i.e. hweight) here.
+ */
+ if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+ return __this_address;
+
+ sfep = next_sfep;
+ }
+ if ((void *)sfep != (void *)endp)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr3_leaf_to_shortform(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args,
+ int forkoff)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_da_args nargs;
+ struct xfs_inode *dp = args->dp;
+ char *tmpbuffer;
+ int error;
+ int i;
+
+ trace_xfs_attr_leaf_to_sf(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ if (!tmpbuffer)
+ return -ENOMEM;
+
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+ leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+ memset(bp->b_addr, 0, args->geo->blksize);
+
+ /*
+ * Clean out the prior contents of the attribute list.
+ */
+ error = xfs_da_shrink_inode(args, 0, bp);
+ if (error)
+ goto out;
+
+ if (forkoff == -1) {
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of a attr replace operations. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
+ goto out;
+ }
+
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Copy the attributes
+ */
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
+ nargs.dp = dp;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!entry->nameidx)
+ continue;
+ ASSERT(entry->flags & XFS_ATTR_LOCAL);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ nargs.name = name_loc->nameval;
+ nargs.namelen = name_loc->namelen;
+ nargs.value = &name_loc->nameval[nargs.namelen];
+ nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+ nargs.hashval = be32_to_cpu(entry->hashval);
+ nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ xfs_attr_shortform_add(&nargs, forkoff);
+ }
+ error = 0;
+
+out:
+ kmem_free(tmpbuffer);
+ return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr3_leaf_to_node(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr icleafhdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_da3_icnode_hdr icnodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp1 = NULL;
+ struct xfs_buf *bp2 = NULL;
+ xfs_dablk_t blkno;
+ int error;
+
+ trace_xfs_attr_leaf_to_node(args);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ goto out;
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
+ if (error)
+ goto out;
+
+ error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK);
+ if (error)
+ goto out;
+
+ /* copy leaf to new buffer, update identifiers */
+ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+ bp2->b_ops = bp1->b_ops;
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
+ }
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ node = bp1->b_addr;
+ xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node);
+
+ leaf = bp2->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ /* both on-disk, don't endian-flip twice */
+ icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ icnodehdr.btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.count = 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+ error = 0;
+out:
+ return error;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_create(args);
+
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ leaf = bp->b_addr;
+ memset(leaf, 0, args->geo->blksize);
+
+ memset(&ichdr, 0, sizeof(ichdr));
+ ichdr.firstused = args->geo->blksize;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+
+ ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+ } else {
+ ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+ ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+ }
+ ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr3_leaf_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
+{
+ xfs_dablk_t blkno;
+ int error;
+
+ trace_xfs_attr_leaf_split(state->args);
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return error;
+ error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+ if (error)
+ return error;
+ newblk->blkno = blkno;
+ newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+ /*
+ * Rebalance the entries across the two leaves.
+ * NOTE: rebalance() currently depends on the 2nd block being empty.
+ */
+ xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error)
+ return error;
+
+ /*
+ * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+ * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+ * "new" attrs info. Will need the "old" info to remove it later.
+ *
+ * Insert the "new" entry in the correct block.
+ */
+ if (state->inleaf) {
+ trace_xfs_attr_leaf_add_old(state->args);
+ error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+ } else {
+ trace_xfs_attr_leaf_add_new(state->args);
+ error = xfs_attr3_leaf_add(newblk->bp, state->args);
+ }
+
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+ newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+ return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr3_leaf_add(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ int tablesize;
+ int entsize;
+ int sum;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index >= 0 && args->index <= ichdr.count);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+ /*
+ * Search through freemap for first-fit on new name length.
+ * (may need to figure in size of entry struct too)
+ */
+ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+ if (tablesize > ichdr.firstused) {
+ sum += ichdr.freemap[i].size;
+ continue;
+ }
+ if (!ichdr.freemap[i].size)
+ continue; /* no space in this map */
+ tmp = entsize;
+ if (ichdr.freemap[i].base < ichdr.firstused)
+ tmp += sizeof(xfs_attr_leaf_entry_t);
+ if (ichdr.freemap[i].size >= tmp) {
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+ goto out_log_hdr;
+ }
+ sum += ichdr.freemap[i].size;
+ }
+
+ /*
+ * If there are no holes in the address space of the block,
+ * and we don't have enough freespace, then compaction will do us
+ * no good and we should just give up.
+ */
+ if (!ichdr.holes && sum < entsize)
+ return -ENOSPC;
+
+ /*
+ * Compact the entries to coalesce free space.
+ * This may change the hdr->count via dropping INCOMPLETE entries.
+ */
+ xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+ /*
+ * After compaction, the block is guaranteed to have only one
+ * free region, in freemap[0]. If it is not big enough, give up.
+ */
+ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+ tmp = -ENOSPC;
+ goto out_log_hdr;
+ }
+
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+ return tmp;
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+ struct xfs_buf *bp,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args,
+ int mapindex)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_mount *mp;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add_work(args);
+
+ leaf = bp->b_addr;
+ ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+ ASSERT(args->index >= 0 && args->index <= ichdr->count);
+
+ /*
+ * Force open some space in the entry array and fill it in.
+ */
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (args->index < ichdr->count) {
+ tmp = ichdr->count - args->index;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry + 1, entry, tmp);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+ }
+ ichdr->count++;
+
+ /*
+ * Allocate space for the new string (at the end of the run).
+ */
+ mp = args->trans->t_mountp;
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+ ASSERT(ichdr->freemap[mapindex].size >=
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+ entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+ ichdr->freemap[mapindex].size);
+ entry->hashval = cpu_to_be32(args->hashval);
+ entry->flags = args->attr_filter;
+ if (tmp)
+ entry->flags |= XFS_ATTR_LOCAL;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ if ((args->blkno2 == args->blkno) &&
+ (args->index2 <= args->index)) {
+ args->index2++;
+ }
+ }
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ ASSERT((args->index == 0) ||
+ (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+ ASSERT((args->index == ichdr->count - 1) ||
+ (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+ /*
+ * For "remote" attribute values, simply note that we need to
+ * allocate space for the "remote" value. We can't actually
+ * allocate the extents in this transaction, and we can't decide
+ * which blocks they should be as we might allocate more blocks
+ * as part of this transaction (a split operation for example).
+ */
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ name_loc->namelen = args->namelen;
+ name_loc->valuelen = cpu_to_be16(args->valuelen);
+ memcpy((char *)name_loc->nameval, args->name, args->namelen);
+ memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->namelen = args->namelen;
+ memcpy((char *)name_rmt->name, args->name, args->namelen);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ /* just in case */
+ name_rmt->valuelen = 0;
+ name_rmt->valueblk = 0;
+ args->rmtblkno = 1;
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
+ }
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+ xfs_attr_leaf_entsize(leaf, args->index)));
+
+ /*
+ * Update the control info for this leaf node
+ */
+ if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+ ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+ ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf));
+ tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr->freemap[i].base == tmp) {
+ ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -=
+ min_t(uint16_t, ichdr->freemap[i].size,
+ sizeof(xfs_attr_leaf_entry_t));
+ }
+ }
+ ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+ return 0;
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+ struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
+ struct xfs_trans *trans = args->trans;
+ char *tmpbuffer;
+
+ trace_xfs_attr_leaf_compact(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
+
+ /*
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
+ */
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = args->geo->blksize;
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
+
+ /*
+ * Copy all entry's in the same (sorted) order,
+ * but allocate name/value pairs packed and in sequence.
+ */
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
+ /*
+ * this logs the entire buffer, but the caller must write the header
+ * back to the buffer when it is finished modifying it.
+ */
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+
+ kmem_free(tmpbuffer);
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_attr3_icleaf_hdr *leaf1hdr,
+ struct xfs_buf *leaf2_bp,
+ struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+
+ entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+ entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+ if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+ ((be32_to_cpu(entries2[0].hashval) <
+ be32_to_cpu(entries1[0].hashval)) ||
+ (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+ be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_mount *mp = leaf1_bp->b_mount;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
+ return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block. At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block. Those
+ * values are used in "atomic rename" operations on attributes. Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_args *args;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+ int count;
+ int totallen;
+ int max;
+ int space;
+ int swap;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
+ ASSERT(ichdr2.count == 0);
+ args = state->args;
+
+ trace_xfs_attr_leaf_rebalance(args);
+
+ /*
+ * Check ordering of blocks, reverse if it makes things simpler.
+ *
+ * NOTE: Given that all (current) callers pass in an empty
+ * second block, this code should never set "swap".
+ */
+ swap = 0;
+ if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+ swap(blk1, blk2);
+
+ /* swap structures rather than reconverting them */
+ swap(ichdr1, ichdr2);
+
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ swap = 1;
+ }
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum. Then get
+ * the direction to copy and the number of elements to move.
+ *
+ * "inleaf" is true if the new entry should be inserted into blk1.
+ * If "swap" is also true, then reverse the sense of "inleaf".
+ */
+ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+ blk2, &ichdr2,
+ &count, &totallen);
+ if (swap)
+ state->inleaf = !state->inleaf;
+
+ /*
+ * Move any entries required from leaf to leaf:
+ */
+ if (count < ichdr1.count) {
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number entries being moved */
+ count = ichdr1.count - count;
+ space = ichdr1.usedbytes - totallen;
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf2 is the destination, compact it if it looks tight.
+ */
+ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+
+ /*
+ * Move high entries from leaf1 to low end of leaf2.
+ */
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
+
+ } else if (count > ichdr1.count) {
+ /*
+ * I assert that since all callers pass in an empty
+ * second buffer, this code should never execute.
+ */
+ ASSERT(0);
+
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number entries being moved */
+ count -= ichdr1.count;
+ space = totallen - ichdr1.usedbytes;
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf1 is the destination, compact it if it looks tight.
+ */
+ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+
+ /*
+ * Move low entries from leaf2 to high end of leaf1.
+ */
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
+ }
+
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ entries1 = xfs_attr3_leaf_entryp(leaf1);
+ entries2 = xfs_attr3_leaf_entryp(leaf2);
+ blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ * NOTE: this code depends on the (current) situation that the
+ * second block was originally empty.
+ *
+ * If the insertion point moved to the 2nd block, we must adjust
+ * the index. We must also track the entry just following the
+ * new entry for use in an "atomic rename" operation, that entry
+ * is always the "old" entry and the "new" entry is what we are
+ * inserting. The index/blkno fields refer to the "old" entry,
+ * while the index2/blkno2 fields refer to the "new" entry.
+ */
+ if (blk1->index > ichdr1.count) {
+ ASSERT(state->inleaf == 0);
+ blk2->index = blk1->index - ichdr1.count;
+ args->index = args->index2 = blk2->index;
+ args->blkno = args->blkno2 = blk2->blkno;
+ } else if (blk1->index == ichdr1.count) {
+ if (state->inleaf) {
+ args->index = blk1->index;
+ args->blkno = blk1->blkno;
+ args->index2 = 0;
+ args->blkno2 = blk2->blkno;
+ } else {
+ /*
+ * On a double leaf split, the original attr location
+ * is already stored in blkno2/index2, so don't
+ * overwrite it overwise we corrupt the tree.
+ */
+ blk2->index = blk1->index - ichdr1.count;
+ args->index = blk2->index;
+ args->blkno = blk2->blkno;
+ if (!state->extravalid) {
+ /*
+ * set the new attr location to match the old
+ * one and let the higher level split code
+ * decide where in the leaf to place it.
+ */
+ args->index2 = blk2->index;
+ args->blkno2 = blk2->blkno;
+ }
+ }
+ } else {
+ ASSERT(state->inleaf == 1);
+ args->index = args->index2 = blk1->index;
+ args->blkno = args->blkno2 = blk1->blkno;
+ }
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary? With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ struct xfs_da_state_blk *blk2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *countarg,
+ int *usedbytesarg)
+{
+ struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
+ struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ int count;
+ int max;
+ int index;
+ int totallen = 0;
+ int half;
+ int lastdelta;
+ int foundit = 0;
+ int tmp;
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ */
+ max = ichdr1->count + ichdr2->count;
+ half = (max + 1) * sizeof(*entry);
+ half += ichdr1->usedbytes + ichdr2->usedbytes +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ half /= 2;
+ lastdelta = state->args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf1);
+ for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+ /*
+ * The new entry is in the first block, account for it.
+ */
+ if (count == blk1->index) {
+ tmp = totallen + sizeof(*entry) +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+ foundit = 1;
+ }
+
+ /*
+ * Wrap around into the second block if necessary.
+ */
+ if (count == ichdr1->count) {
+ leaf1 = leaf2;
+ entry = xfs_attr3_leaf_entryp(leaf1);
+ index = 0;
+ }
+
+ /*
+ * Figure out if next leaf entry would be too much.
+ */
+ tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+ index);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+#undef XFS_ATTR_ABS
+ }
+
+ /*
+ * Calculate the number of usedbytes that will end up in lower block.
+ * If new entry not in lower block, fix up the count.
+ */
+ totallen -= count * sizeof(*entry);
+ if (foundit) {
+ totallen -= sizeof(*entry) +
+ xfs_attr_leaf_newentsize(state->args, NULL);
+ }
+
+ *countarg = count;
+ *usedbytesarg = totallen;
+ return foundit;
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+ struct xfs_da_state *state,
+ int *action)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno;
+ int bytes;
+ int forward;
+ int error;
+ int retval;
+ int i;
+
+ trace_xfs_attr_leaf_toosmall(state->args);
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ leaf = blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
+ bytes = xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+ ichdr.usedbytes;
+ if (bytes > (state->args->geo->blksize >> 1)) {
+ *action = 0; /* blk over 50%, don't try to join */
+ return 0;
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (ichdr.count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (ichdr.forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return 0;
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink an attribute list over time.
+ */
+ /* start with smaller blk num */
+ forward = ichdr.forw < ichdr.back;
+ for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ if (forward)
+ blkno = ichdr.forw;
+ else
+ blkno = ichdr.back;
+ if (blkno == 0)
+ continue;
+ error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+ blkno, &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
+
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
+ ichdr.usedbytes - ichdr2.usedbytes -
+ ((ichdr.count + ichdr2.count) *
+ sizeof(xfs_attr_leaf_entry_t)) -
+ xfs_attr3_leaf_hdr_size(leaf);
+
+ xfs_trans_brelse(state->args->trans, bp);
+ if (bytes >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ } else {
+ error = xfs_da3_path_shift(state, &state->path, forward,
+ 0, &retval);
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 1;
+ }
+ return 0;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr3_leaf_remove(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ int before;
+ int after;
+ int smallest;
+ int entsize;
+ int tablesize;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_remove(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index >= 0 && args->index < ichdr.count);
+ ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+ xfs_attr3_leaf_hdr_size(leaf));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+ /*
+ * Scan through free region table:
+ * check for adjacency of free'd entry with an existing one,
+ * find smallest free region in case we need to replace it,
+ * adjust any map that borders the entry table,
+ */
+ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ tmp = ichdr.freemap[0].size;
+ before = after = -1;
+ smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+ entsize = xfs_attr_leaf_entsize(leaf, args->index);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+ if (ichdr.freemap[i].base == tablesize) {
+ ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+ }
+
+ if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+ be16_to_cpu(entry->nameidx)) {
+ before = i;
+ } else if (ichdr.freemap[i].base ==
+ (be16_to_cpu(entry->nameidx) + entsize)) {
+ after = i;
+ } else if (ichdr.freemap[i].size < tmp) {
+ tmp = ichdr.freemap[i].size;
+ smallest = i;
+ }
+ }
+
+ /*
+ * Coalesce adjacent freemap regions,
+ * or replace the smallest region.
+ */
+ if ((before >= 0) || (after >= 0)) {
+ if ((before >= 0) && (after >= 0)) {
+ ichdr.freemap[before].size += entsize;
+ ichdr.freemap[before].size += ichdr.freemap[after].size;
+ ichdr.freemap[after].base = 0;
+ ichdr.freemap[after].size = 0;
+ } else if (before >= 0) {
+ ichdr.freemap[before].size += entsize;
+ } else {
+ ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[after].size += entsize;
+ }
+ } else {
+ /*
+ * Replace smallest region (if it is smaller than free'd entry)
+ */
+ if (ichdr.freemap[smallest].size < entsize) {
+ ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[smallest].size = entsize;
+ }
+ }
+
+ /*
+ * Did we remove the first entry?
+ */
+ if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+ smallest = 1;
+ else
+ smallest = 0;
+
+ /*
+ * Compress the remaining entries and zero out the removed stuff.
+ */
+ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+ ichdr.usedbytes -= entsize;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+ entsize));
+
+ tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry, entry + 1, tmp);
+ ichdr.count--;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+ memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+ /*
+ * If we removed the first entry, re-find the first used byte
+ * in the name area. Note that if the entry was the "firstused",
+ * then we don't have a "hole" in our block resulting from
+ * removing the name.
+ */
+ if (smallest) {
+ tmp = args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+ if (be16_to_cpu(entry->nameidx) < tmp)
+ tmp = be16_to_cpu(entry->nameidx);
+ }
+ ichdr.firstused = tmp;
+ ASSERT(ichdr.firstused != 0);
+ } else {
+ ichdr.holes = 1; /* mark as needing compaction */
+ }
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+
+ /*
+ * Check if leaf is less than 50% full, caller may want to
+ * "join" the leaf with a sibling if so.
+ */
+ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr3_leaf_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+ struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+ struct xfs_attr3_icleaf_hdr drophdr;
+ struct xfs_attr3_icleaf_hdr savehdr;
+ struct xfs_attr_leaf_entry *entry;
+
+ trace_xfs_attr_leaf_unbalance(state->args);
+
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
+ entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+ /*
+ * Save last hashval from dying block for later Btree fixup.
+ */
+ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+ /*
+ * Check if we need a temp buffer, or can we do it in place.
+ * Note that we don't check "leaf" for holes because we will
+ * always be dropping it, toosmall() decided that for us already.
+ */
+ if (savehdr.holes == 0) {
+ /*
+ * dest leaf has no holes, so we add there. May need
+ * to make some room in the entry array.
+ */
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr, 0,
+ drophdr.count);
+ } else {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr,
+ savehdr.count, drophdr.count);
+ }
+ } else {
+ /*
+ * Destination has holes, so we make a temporary copy
+ * of the leaf and add them both to that.
+ */
+ struct xfs_attr_leafblock *tmp_leaf;
+ struct xfs_attr3_icleaf_hdr tmphdr;
+
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+ memset(&tmphdr, 0, sizeof(tmphdr));
+ tmphdr.magic = savehdr.magic;
+ tmphdr.forw = savehdr.forw;
+ tmphdr.back = savehdr.back;
+ tmphdr.firstused = state->args->geo->blksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
+
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ savehdr.count);
+ } else {
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ drophdr.count);
+ }
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+ savehdr = tmphdr; /* struct copy */
+ kmem_free(tmp_leaf);
+ }
+
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
+ xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+ state->args->geo->blksize - 1);
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ entry = xfs_attr3_leaf_entryp(save_leaf);
+ save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node. The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ xfs_dahash_t hashval;
+ int probe;
+ int span;
+
+ trace_xfs_attr_leaf_lookup(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (ichdr.count >= args->geo->blksize / 8) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Binary search. (note: small blocks will skip this loop)
+ */
+ hashval = args->hashval;
+ probe = span = ichdr.count / 2;
+ for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+ span /= 2;
+ if (be32_to_cpu(entry->hashval) < hashval)
+ probe += span;
+ else if (be32_to_cpu(entry->hashval) > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+ if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Since we may have duplicate hashval's, find the first matching
+ * hashval in the leaf.
+ */
+ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+ entry--;
+ probe--;
+ }
+ while (probe < ichdr.count &&
+ be32_to_cpu(entry->hashval) < hashval) {
+ entry++;
+ probe++;
+ }
+ if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+ args->index = probe;
+ return -ENOATTR;
+ }
+
+ /*
+ * Duplicate keys may be present, so search all of them for a match.
+ */
+ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+ entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+ if (!xfs_attr_match(args, name_loc->namelen,
+ name_loc->nameval, entry->flags))
+ continue;
+ args->index = probe;
+ return -EEXIST;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+ if (!xfs_attr_match(args, name_rmt->namelen,
+ name_rmt->name, entry->flags))
+ continue;
+ args->index = probe;
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->rmtvaluelen);
+ return -EEXIST;
+ }
+ }
+ args->index = probe;
+ return -ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ *
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
+ */
+int
+xfs_attr3_leaf_getvalue(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index < ichdr.count);
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ ASSERT(name_loc->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+ return xfs_attr_copy_value(args,
+ &name_loc->nameval[args->namelen],
+ be16_to_cpu(name_loc->valuelen));
+ }
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ ASSERT(name_rmt->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
+ return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
+ struct xfs_attr_leafblock *leaf_s,
+ struct xfs_attr3_icleaf_hdr *ichdr_s,
+ int start_s,
+ struct xfs_attr_leafblock *leaf_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_d,
+ int start_d,
+ int count)
+{
+ struct xfs_attr_leaf_entry *entry_s;
+ struct xfs_attr_leaf_entry *entry_d;
+ int desti;
+ int tmp;
+ int i;
+
+ /*
+ * Check for nothing to do.
+ */
+ if (count == 0)
+ return;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+ ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+ ASSERT(ichdr_s->magic == ichdr_d->magic);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+ ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ + xfs_attr3_leaf_hdr_size(leaf_s));
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
+ ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ + xfs_attr3_leaf_hdr_size(leaf_d));
+
+ ASSERT(start_s < ichdr_s->count);
+ ASSERT(start_d <= ichdr_d->count);
+ ASSERT(count <= ichdr_s->count);
+
+
+ /*
+ * Move the entries in the destination leaf up to make a hole?
+ */
+ if (start_d < ichdr_d->count) {
+ tmp = ichdr_d->count - start_d;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+ memmove(entry_d, entry_s, tmp);
+ }
+
+ /*
+ * Copy all entry's in the same (sorted) order,
+ * but allocate attribute info packed and in sequence.
+ */
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ desti = start_d;
+ for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+ ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+ tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+ /*
+ * Code to drop INCOMPLETE entries. Difficult to use as we
+ * may also need to change the insertion index. Code turned
+ * off for 6.2, should be revisited later.
+ */
+ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_s->count -= 1;
+ entry_d--; /* to compensate for ++ in loop hdr */
+ desti--;
+ if ((start_s + i) < offset)
+ result++; /* insertion index adjustment */
+ } else {
+#endif /* GROT */
+ ichdr_d->firstused -= tmp;
+ /* both on-disk, don't endian flip twice */
+ entry_d->hashval = entry_s->hashval;
+ entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+ entry_d->flags = entry_s->flags;
+ ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+ <= args->geo->blksize);
+ memmove(xfs_attr3_leaf_name(leaf_d, desti),
+ xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+ ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+ <= args->geo->blksize);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_d->usedbytes += tmp;
+ ichdr_s->count -= 1;
+ ichdr_d->count += 1;
+ tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf_d);
+ ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+ }
+#endif /* GROT */
+ }
+
+ /*
+ * Zero out the entries we just copied.
+ */
+ if (start_s == ichdr_s->count) {
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
+ } else {
+ /*
+ * Move the remaining entries down to fill the hole,
+ * then zero the entries at the top.
+ */
+ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ memmove(entry_d, entry_s, tmp);
+
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
+ }
+
+ /*
+ * Fill in the freemap information
+ */
+ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+ ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+ ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ ichdr_d->freemap[1].base = 0;
+ ichdr_d->freemap[2].base = 0;
+ ichdr_d->freemap[1].size = 0;
+ ichdr_d->freemap[2].size = 0;
+ ichdr_s->holes = 1; /* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
+ if (count)
+ *count = ichdr.count;
+ if (!ichdr.count)
+ return 0;
+ return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+ struct xfs_attr_leaf_entry *entries;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ int size;
+
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, index);
+ size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+ size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+ }
+ return size;
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
+{
+ int size;
+
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
+ *local = 1;
+ return size;
+ }
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
+}
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_clearflag(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen;
+ char *name;
+#endif /* DEBUG */
+
+ trace_xfs_attr_leaf_clearflag(args);
+ /*
+ * Set up the operation.
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ if (error)
+ return error;
+
+ leaf = bp->b_addr;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+ namelen = name_loc->namelen;
+ name = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ namelen = name_rmt->namelen;
+ name = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+ ASSERT(namelen == args->namelen);
+ ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+ entry->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+ if (args->rmtblkno) {
+ ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_setflag(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+ trace_xfs_attr_leaf_setflag(args);
+
+ /*
+ * Set up the operation.
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ if (error)
+ return error;
+
+ leaf = bp->b_addr;
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+#endif
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr3_leaf_flipflags(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr_leaf_entry *entry1;
+ struct xfs_attr_leaf_entry *entry2;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp1;
+ struct xfs_buf *bp2;
+ int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen1, namelen2;
+ char *name1, *name2;
+#endif /* DEBUG */
+
+ trace_xfs_attr_leaf_flipflags(args);
+
+ /*
+ * Read the block containing the "old" attr
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
+ if (error)
+ return error;
+
+ /*
+ * Read the block containing the "new" attr, if it is different
+ */
+ if (args->blkno2 != args->blkno) {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+ &bp2);
+ if (error)
+ return error;
+ } else {
+ bp2 = bp1;
+ }
+
+ leaf1 = bp1->b_addr;
+ entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+ leaf2 = bp2->b_addr;
+ entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
+ ASSERT(args->index < ichdr1.count);
+ ASSERT(args->index >= 0);
+
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
+ ASSERT(args->index2 < ichdr2.count);
+ ASSERT(args->index2 >= 0);
+
+ if (entry1->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+ namelen1 = name_loc->namelen;
+ name1 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+ namelen1 = name_rmt->namelen;
+ name1 = (char *)name_rmt->name;
+ }
+ if (entry2->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+ namelen2 = name_loc->namelen;
+ name2 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+ namelen2 = name_rmt->namelen;
+ name2 = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+ ASSERT(namelen1 == namelen2);
+ ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+ ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+ ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+ entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+ if (args->rmtblkno) {
+ ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+ }
+
+ entry2->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_trans_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+ if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_trans_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000..368f4d9fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+struct attrlist;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Incore version of the attribute leaf header.
+ */
+struct xfs_attr3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t usedbytes;
+ /*
+ * Firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ uint32_t firstused;
+ __u8 holes;
+ struct {
+ uint16_t base;
+ uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void xfs_attr_shortform_create(struct xfs_da_args *args);
+void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_sf_removename(struct xfs_da_args *args);
+int xfs_attr_sf_findname(struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep);
+int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
+ struct xfs_da_args *args, int forkoff);
+int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_attr3_leaf_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk);
+int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+ struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+/*
+ * Utility routines.
+ */
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bpp);
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from);
+void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from);
+
+#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000..d440393b4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,712 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+/*
+ * Remote Attribute Values
+ * =======================
+ *
+ * Remote extended attribute values are conceptually simple -- they're written
+ * to data blocks mapped by an inode's attribute fork, and they have an upper
+ * size limit of 64k. Setting a value does not involve the XFS log.
+ *
+ * However, on a v5 filesystem, maximally sized remote attr values require one
+ * block more than 64k worth of space to hold both the remote attribute value
+ * header (64 bytes). On a 4k block filesystem this results in a 68k buffer;
+ * on a 64k block filesystem, this would be a 128k buffer. Note that the log
+ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
+ * Therefore, we /must/ ensure that remote attribute value buffers never touch
+ * the logging system and therefore never have a log item.
+ */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+ struct xfs_mount *mp,
+ int attrlen)
+{
+ if (xfs_has_crc(mp)) {
+ int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+ return (attrlen + buflen - 1) / buflen;
+ }
+ return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static xfs_failaddr_t
+xfs_attr3_rmt_hdr_ok(
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (bno != be64_to_cpu(rmt->rm_blkno))
+ return __this_address;
+ if (offset != be32_to_cpu(rmt->rm_offset))
+ return __this_address;
+ if (size != be32_to_cpu(rmt->rm_bytes))
+ return __this_address;
+ if (ino != be64_to_cpu(rmt->rm_owner))
+ return __this_address;
+
+ /* ok */
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_attr3_rmt_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ void *ptr,
+ int fsbsize,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
+ return __this_address;
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(rmt->rm_blkno) != bno)
+ return __this_address;
+ if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ return __this_address;
+ if (be32_to_cpu(rmt->rm_offset) +
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
+ return __this_address;
+ if (rmt->rm_owner == 0)
+ return __this_address;
+
+ return NULL;
+}
+
+static int
+__xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp,
+ bool check_crc,
+ xfs_failaddr_t *failaddr)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ ptr = bp->b_addr;
+ bno = xfs_buf_daddr(bp);
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (check_crc &&
+ !xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+ *failaddr = __this_address;
+ return -EFSBADCRC;
+ }
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ if (*failaddr)
+ return -EFSCORRUPTED;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (len != 0) {
+ *failaddr = __this_address;
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error;
+
+ error = __xfs_attr3_rmt_read_verify(bp, true, &fa);
+ if (error)
+ xfs_verifier_error(bp, error, fa);
+}
+
+static xfs_failaddr_t
+xfs_attr3_rmt_verify_struct(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error;
+
+ error = __xfs_attr3_rmt_read_verify(bp, false, &fa);
+ return error ? fa : NULL;
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+ int blksize = mp->m_attr_geo->blksize;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ ptr = bp->b_addr;
+ bno = xfs_buf_daddr(bp);
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ /*
+ * Ensure we aren't writing bogus LSNs to disk. See
+ * xfs_attr3_rmt_hdr_set() for the explanation.
+ */
+ if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ return;
+ }
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (len != 0)
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
+ .verify_read = xfs_attr3_rmt_read_verify,
+ .verify_write = xfs_attr3_rmt_write_verify,
+ .verify_struct = xfs_attr3_rmt_verify_struct,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+ struct xfs_mount *mp,
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+ rmt->rm_offset = cpu_to_be32(offset);
+ rmt->rm_bytes = cpu_to_be32(size);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
+ rmt->rm_owner = cpu_to_be64(ino);
+ rmt->rm_blkno = cpu_to_be64(bno);
+
+ /*
+ * Remote attribute blocks are written synchronously, so we don't
+ * have an LSN that we can stamp in them that makes any sense to log
+ * recovery. To ensure that log recovery handles overwrites of these
+ * blocks sanely (i.e. once they've been freed and reallocated as some
+ * other type of metadata) we need to ensure that the LSN has a value
+ * that tells log recovery to ignore the LSN and overwrite the buffer
+ * with whatever is in it's log. To do this, we use the magic
+ * NULLCOMMITLSN to indicate that the LSN is invalid.
+ */
+ rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN);
+
+ return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ uint8_t **dst)
+{
+ char *src = bp->b_addr;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size = 0;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+
+ if (xfs_has_crc(mp)) {
+ if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ byte_cnt, bno)) {
+ xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+ bno, *offset, byte_cnt, ino);
+ return -EFSCORRUPTED;
+ }
+ hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+ }
+
+ memcpy(*dst, src + hdr_size, byte_cnt);
+
+ /* roll buffer forwards */
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *dst += byte_cnt;
+ *offset += byte_cnt;
+ }
+ return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ uint8_t **src)
+{
+ char *dst = bp->b_addr;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+ hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+ byte_cnt, bno);
+
+ memcpy(dst + hdr_size, *src, byte_cnt);
+
+ /*
+ * If this is the last block, zero the remainder of it.
+ * Check that we are actually the last block, too.
+ */
+ if (byte_cnt + hdr_size < blksize) {
+ ASSERT(*valuelen - byte_cnt == 0);
+ ASSERT(len == blksize);
+ memset(dst + hdr_size + byte_cnt, 0,
+ blksize - hdr_size - byte_cnt);
+ }
+
+ /* roll buffer forwards */
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *src += byte_cnt;
+ *offset += byte_cnt;
+ }
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
+int
+xfs_attr_rmtval_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_buf *bp;
+ xfs_dablk_t lblkno = args->rmtblkno;
+ uint8_t *dst = args->value;
+ int valuelen;
+ int nmap;
+ int error;
+ int blkcnt = args->rmtblkcnt;
+ int i;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_get(args);
+
+ ASSERT(args->valuelen != 0);
+ ASSERT(args->rmtvaluelen == args->valuelen);
+
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
+ 0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attributes value into
+ */
+int
+xfs_attr_rmt_find_hole(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
+ int blkcnt;
+ xfs_fileoff_t lfileoff = 0;
+
+ /*
+ * Because CRC enable attributes have headers, we can't just do a
+ * straight byte to FSB conversion and have to take the header space
+ * into account.
+ */
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ args->rmtblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ return 0;
+}
+
+int
+xfs_attr_rmtval_set_value(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
+
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
+
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+
+ /* roll attribute extent map forwards */
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/* Mark stale any incore buffers for the remote value. */
+int
+xfs_attr_rmtval_stale(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ return -EFSCORRUPTED;
+
+ error = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map->br_startblock),
+ XFS_FSB_TO_BB(mp, map->br_blockcount),
+ incore_flags, &bp);
+ if (error) {
+ if (error == -ENOENT)
+ return 0;
+ return error;
+ }
+
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+/*
+ * Find a hole for the attr and store it in the delayed attr context. This
+ * initializes the context to roll through allocating an attr extent for a
+ * delayed attr operation
+ */
+int
+xfs_attr_rmtval_find_space(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int error;
+
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
+ args->rmtblkcnt = 0;
+ args->rmtblkno = 0;
+ memset(map, 0, sizeof(struct xfs_bmbt_irec));
+
+ error = xfs_attr_rmt_find_hole(args);
+ if (error)
+ return error;
+
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
+
+ return 0;
+}
+
+/*
+ * Write one block of the value associated with an attribute into the
+ * out-of-line buffer that we have defined for it. This is similar to a subset
+ * of xfs_attr_rmtval_set, but records the current block to the delayed attr
+ * context, and leaves transaction handling to the caller.
+ */
+int
+xfs_attr_rmtval_set_blk(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int nmap;
+ int error;
+
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap == 1);
+ ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
+ (map->br_startblock != HOLESTARTBLOCK));
+
+ /* roll attribute extent map forwards */
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
+
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_invalidate(
+ struct xfs_da_args *args)
+{
+ xfs_dablk_t lblkno;
+ int blkcnt;
+ int error;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's blocks.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ while (blkcnt > 0) {
+ struct xfs_bmbt_irec map;
+ int nmap;
+
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
+ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
+ if (error)
+ return error;
+
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the out-of-line
+ * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the
+ * transaction and re-call the function. Callers should keep calling this
+ * routine until it returns something other than -EAGAIN.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error, done;
+
+ /*
+ * Unmap value blocks for this attr.
+ */
+ error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+ args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+ if (error)
+ return error;
+
+ /*
+ * We don't need an explicit state here to pick up where we left off. We
+ * can figure it out using the !done return code. The actual value of
+ * attr->xattri_dela_state may be some value reminiscent of the calling
+ * function, but it's value is irrelevant with in the context of this
+ * function. Once we are done here, the next state is set as needed by
+ * the parent
+ */
+ if (!done) {
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
+ return -EAGAIN;
+ }
+
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644
index 000000000..d097ec6c4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags);
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr);
+int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr);
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644
index 000000000..37578b369
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+ uint8_t entno; /* entry number in original list */
+ uint8_t namelen; /* length of name value (no null) */
+ uint8_t valuelen; /* length of value */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ xfs_dahash_t hash; /* this entry's hash value */
+ unsigned char *name; /* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
+ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
+
+/* space name/value uses */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/* space an entry uses */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/* next entry in struct */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
+
+#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
new file mode 100644
index 000000000..40ce5f309
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_bit.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+ uint i;
+
+ for (i = 0; i < size; i++) {
+ if (map[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit. Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = 0;
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ ASSERT(start_bit < size);
+ size -= start_bit & ~(NBWORD - 1);
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* set to one first offset bits prior to start */
+ tmp |= (~0U >> (NBWORD-start_bit));
+ if (tmp != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return result - start_bit;
+found:
+ return result + ffz(tmp) - start_bit;
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = start_bit & ~(NBWORD - 1);
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ if (start_bit >= size)
+ return -1;
+ size -= result;
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* set to zero first offset bits prior to start */
+ tmp &= (~0U << start_bit);
+ if (tmp != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return -1;
+found:
+ return result + ffs(tmp) - 1;
+}
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000..a04f266ae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 64-bit values
+ */
+static inline uint64_t xfs_mask64hi(int n)
+{
+ return (uint64_t)-1 << (64 - (n));
+}
+static inline uint32_t xfs_mask32lo(int n)
+{
+ return ((uint32_t)1 << (n)) - 1;
+}
+static inline uint64_t xfs_mask64lo(int n)
+{
+ return ((uint64_t)1 << (n)) - 1;
+}
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(uint32_t v)
+{
+ return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(uint64_t v)
+{
+ return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(uint32_t v)
+{
+ return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(uint64_t v)
+{
+ uint32_t w = (uint32_t)v;
+ int n = 0;
+
+ if (w) { /* lower bits */
+ n = ffs(w);
+ } else { /* upper bits */
+ w = (uint32_t)(v >> 32);
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
+ }
+ return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000..49d0d4ea6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,6230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_filestream.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
+#include "xfs_icache.h"
+#include "xfs_iomap.h"
+
+struct kmem_cache *xfs_bmap_intent_cache;
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
+ int level; /* btree level */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
+
+ /*
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that the
+ * fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted with
+ * ATTR2 and then mounted back with ATTR1, keeping the i_forkoff's fixed
+ * but probably at various positions. Therefore, for both ATTR1 and
+ * ATTR2 we have to assume the worst case scenario of a minimum size
+ * available.
+ */
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ else
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = howmany_64(maxleafents, minleafrecs);
+ for (level = 1; maxblocks > 1; level++) {
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1;
+ else
+ maxblocks = howmany_64(maxblocks, minnoderecs);
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
+}
+
+unsigned int
+xfs_bmap_compute_attr_offset(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_inodesize == 256)
+ return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b = *irec;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_first(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = 0;
+ cur->bc_rec.b.br_startblock = 0;
+ cur->bc_rec.b.br_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ return whichfork != XFS_COW_FORK &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ return whichfork != XFS_COW_FORK &&
+ ifp->if_format == XFS_DINODE_FMT_BTREE &&
+ ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given by irec
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec)
+{
+ union xfs_btree_rec rec;
+
+ xfs_bmbt_disk_set_all(&rec.bmbt, irec);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
+{
+ int level; /* btree level number */
+ int maxrecs; /* maximum record count at this level */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t rval; /* return value */
+
+ mp = ip->i_mount;
+ maxrecs = mp->m_bmap_dmxr[0];
+ for (level = 0, rval = 0;
+ level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+ level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ rval += len;
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ level - 1;
+ if (level == 0)
+ maxrecs = mp->m_bmap_dmxr[1];
+ }
+ return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ if (ip->i_df.if_format == XFS_DINODE_FMT_DEV)
+ return roundup(sizeof(xfs_dev_t), 8);
+ return M_IGEO(ip->i_mount)->attr_fork_offset;
+}
+
+/*
+ * Helper routine to reset inode i_forkoff field when switching attribute fork
+ * from local to extent format - we reset it where possible to make space
+ * available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ if (whichfork == XFS_ATTR_FORK &&
+ ip->i_df.if_format != XFS_DINODE_FMT_DEV &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_forkoff)
+ ip->i_forkoff = dfl_forkoff;
+ }
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item *lip;
+ int i;
+
+ if (!cur)
+ return NULL;
+
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
+ break;
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
+ struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
+
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ xfs_buf_daddr(bip->bli_buf) == bno)
+ return bip->bli_buf;
+ }
+
+ return NULL;
+}
+
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int i, j, dmxr;
+ __be64 *pp, *thispa; /* pointer to block address */
+ xfs_bmbt_key_t *prevp, *keyp;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ prevp = NULL;
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+ dmxr = mp->m_bmap_dmxr[0];
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+ if (prevp) {
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
+ }
+ prevp = keyp;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+ for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ if (*thispa == *pp) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
+ __func__, j, i,
+ (unsigned long long)be64_to_cpu(*thispa));
+ xfs_err(mp, "%s: ptrs are equal in node\n",
+ __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ }
+ }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves. THis becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, so for such large
+ * bmapbt constructs that is going to catch most corruptions.
+ */
+STATIC void
+xfs_bmap_check_leaf_extents(
+ struct xfs_btree_cur *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ struct xfs_buf *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ int level; /* btree level, for checking */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0;
+
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
+ return;
+
+ /* skip large extent count inodes */
+ if (ip->i_df.if_nextents > 10000)
+ return;
+
+ bno = NULLFSBLOCK;
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+
+ return;
+
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
+ __func__, i);
+ xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return;
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ uint32_t flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ int i; /* index to map values */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (i = 0; i < ret_nmap; i++) {
+ ASSERT(mval[i].br_blockcount > 0);
+ if (!(flags & XFS_BMAPI_ENTIRE)) {
+ ASSERT(mval[i].br_startoff >= bno);
+ ASSERT(mval[i].br_blockcount <= len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+ bno + len);
+ } else {
+ ASSERT(mval[i].br_startoff < bno + len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+ bno);
+ }
+ ASSERT(i == 0 ||
+ mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+ mval[i].br_startoff);
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+ mval[i].br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
+#endif /* DEBUG */
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ struct xfs_buf *cbp; /* child block's buffer */
+ int error; /* error return value */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_owner_info oinfo;
+
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
+ ASSERT(whichfork != XFS_COW_FORK);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+#ifdef DEBUG
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ return -EFSCORRUPTED;
+#endif
+ error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ ip->i_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ struct xfs_buf *abp; /* buffer for ablock */
+ struct xfs_alloc_arg args; /* allocation arguments */
+ struct xfs_bmbt_rec *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ int error; /* error return value */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ struct xfs_bmbt_key *kp; /* root block key pointer */
+ struct xfs_mount *mp; /* mount structure */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ xfs_extnum_t cnt = 0;
+
+ mp = ip->i_mount;
+ ASSERT(whichfork != XFS_COW_FORK);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
+
+ /*
+ * Make space in the inode incore. This needs to be undone if we fail
+ * to expand the root.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ ifp->if_format = XFS_DINODE_FMT_BTREE;
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = mp;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
+ if (tp->t_firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (tp->t_flags & XFS_TRANS_LOWMODE) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = tp->t_firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = tp->t_firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_root_realloc;
+
+ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+ error = -ENOSPC;
+ goto out_root_realloc;
+ }
+
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+ ASSERT(tp->t_firstblock == NULLFSBLOCK ||
+ args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
+ tp->t_firstblock = args.fsbno;
+ cur->bc_ino.allocated++;
+ ip->i_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, args.fsbno),
+ mp->m_bsize, 0, &abp);
+ if (error)
+ goto out_unreserve_dquot;
+
+ /*
+ * Fill in the child block.
+ */
+ abp->b_ops = &xfs_bmbt_buf_ops;
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
+ XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+ xfs_bmbt_disk_set_all(arp, &rec);
+ cnt++;
+ }
+ ASSERT(cnt == ifp->if_nextents);
+ xfs_btree_set_numrecs(ablock, cnt);
+
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+
+out_unreserve_dquot:
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+out_root_realloc:
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ASSERT(ifp->if_broot == NULL);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+
+ return error;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ ASSERT(whichfork != XFS_COW_FORK);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(ifp->if_nextents == 0);
+
+ xfs_bmap_forkoff_reset(ip, whichfork);
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork,
+ void (*init_fn)(struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
+{
+ int error = 0;
+ int flags; /* logging flags returned */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ struct xfs_buf *bp; /* buffer for extent block */
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
+
+ flags = 0;
+ error = 0;
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (tp->t_firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = tp->t_firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ tp->t_firstblock = args.fsbno;
+ error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(args.mp, args.fsbno),
+ args.mp->m_bsize, 0, &bp);
+ if (error)
+ goto done;
+
+ /*
+ * Initialize the block, copy the data and log the remote buffer.
+ *
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
+ */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size */
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
+ flags |= XFS_ILOG_CORE;
+
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+
+ rec.br_startoff = 0;
+ rec.br_startblock = args.fsbno;
+ rec.br_blockcount = 1;
+ rec.br_state = XFS_EXT_NORM;
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &rec, 0);
+
+ ifp->if_nextents = 1;
+ ip->i_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_btree_block *block = ip->i_df.if_broot;
+ struct xfs_btree_cur *cur; /* btree cursor */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* file system mount struct */
+ int stat; /* newroot status */
+
+ mp = ip->i_mount;
+
+ if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
+ *flags |= XFS_ILOG_DBROOT;
+ else {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ error = xfs_bmbt_lookup_first(cur, &stat);
+ if (error)
+ goto error0;
+ /* must be at least one entry */
+ if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+ goto error0;
+ if (stat == 0) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return -ENOSPC;
+ }
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ }
+ return 0;
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_extents(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ int error; /* error return value */
+
+ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
+ xfs_inode_data_fork_size(ip))
+ return 0;
+ cur = NULL;
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
+ XFS_DATA_FORK);
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_local(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ int *flags) /* inode logging flags */
+{
+ struct xfs_da_args dargs; /* args for dir/attr code */
+
+ if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
+ return 0;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
+ dargs.dp = ip;
+ dargs.total = dargs.geo->fsbcount;
+ dargs.whichfork = XFS_DATA_FORK;
+ dargs.trans = tp;
+ return xfs_dir2_sf_to_block(&dargs);
+ }
+
+ if (S_ISLNK(VFS_I(ip)->i_mode))
+ return xfs_bmap_local_to_extents(tp, ip, 1, flags,
+ XFS_DATA_FORK,
+ xfs_symlink_local_to_remote);
+
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Set an inode attr fork offset based on the format of the data fork.
+ */
+static int
+xfs_bmap_set_attrforkoff(
+ struct xfs_inode *ip,
+ int size,
+ int *version)
+{
+ int default_size = xfs_default_attroffset(ip) >> 3;
+
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_forkoff = default_size;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_forkoff)
+ ip->i_forkoff = default_size;
+ else if (xfs_has_attr2(ip->i_mount) && version)
+ *version = 2;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+
+ ASSERT(xfs_inode_has_attr_fork(ip) == 0);
+
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
+ if (error)
+ return error;
+ if (xfs_inode_has_attr_fork(ip))
+ goto trans_cancel;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ if (error)
+ goto trans_cancel;
+
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+ logflags = 0;
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto trans_cancel;
+ if (!xfs_has_attr(mp) ||
+ (!xfs_has_attr2(mp) && version == 2)) {
+ bool log_sb = false;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_has_attr(mp)) {
+ xfs_add_attr(mp);
+ log_sb = true;
+ }
+ if (!xfs_has_attr2(mp) && version == 2) {
+ xfs_add_attr2(mp);
+ log_sb = true;
+ }
+ spin_unlock(&mp->m_sb_lock);
+ if (log_sb)
+ xfs_log_sb(tp);
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+struct xfs_iread_state {
+ struct xfs_iext_cursor icur;
+ xfs_extnum_t loaded;
+};
+
+/* Stuff every bmbt record from this block into the incore extent map. */
+static int
+xfs_iread_bmbt_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xfs_iread_state *ir = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_rec *frp;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t j;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* Abort if we find more records than nextents. */
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) {
+ xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).",
+ (unsigned long long)ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
+ sizeof(*block), __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ /* Copy records into the incore cache. */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
+ struct xfs_bmbt_irec new;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(frp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iread_extents(2)", frp,
+ sizeof(*frp), fa);
+ return -EFSCORRUPTED;
+ }
+ xfs_iext_insert(ip, &ir->icur, &new,
+ xfs_bmap_fork_to_state(whichfork));
+ trace_xfs_read_extent(ip, &ir->icur,
+ xfs_bmap_fork_to_state(whichfork), _THIS_IP_);
+ xfs_iext_next(ifp, &ir->icur);
+ }
+
+ return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ */
+int
+xfs_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_iread_state ir;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_need_iread_extents(ifp))
+ return 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ ir.loaded = 0;
+ xfs_iext_first(ifp, &ir.icur);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block,
+ XFS_BTREE_VISIT_RECORDS, &ir);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ ASSERT(ir.loaded == xfs_iext_count(ifp));
+ return 0;
+out:
+ xfs_iext_destroy(ifp);
+ return error;
+}
+
+/*
+ * Returns the relative block number of the first unused block(s) in the given
+ * fork with at least "len" logically contiguous blocks free. This is the
+ * lowest-address hole if the fork has holes, else the first block past the end
+ * of fork. Return 0 if the fork is currently local (in-inode).
+ */
+int /* error */
+xfs_bmap_first_unused(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t lastaddr = 0;
+ xfs_fileoff_t lowest, max;
+ int error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+
+ ASSERT(xfs_ifork_has_extents(ifp));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ lowest = max = *first_unused;
+ for_each_xfs_iext(ifp, &icur, &got) {
+ /*
+ * See if the hole before this extent will work.
+ */
+ if (got.br_startoff >= lowest + len &&
+ got.br_startoff - max >= len)
+ break;
+ lastaddr = got.br_startoff + got.br_blockcount;
+ max = XFS_FILEOFF_MAX(lastaddr, lowest);
+ }
+
+ *first_unused = max;
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_before(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ *last_block = 0;
+ return 0;
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
+ *last_block = 0;
+ return 0;
+}
+
+int
+xfs_bmap_last_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *rec,
+ int *is_empty)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ int error;
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, rec))
+ *is_empty = 1;
+ else
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ bma->aeof = false;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error)
+ return error;
+
+ if (is_empty) {
+ bma->aeof = true;
+ return 0;
+ }
+
+ /*
+ * Check if we are allocation or past the last extent, or at least into
+ * the last delayed allocated extent.
+ */
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *last_block,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ *last_block = 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+ return 0;
+
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
+ return error;
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_delay_real(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ struct xfs_bmbt_irec *new = &bma->got;
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t da_new; /* new count del alloc blocks used */
+ xfs_filblks_t da_old; /* old count del alloc blocks used */
+ xfs_filblks_t temp=0; /* value for da_new calculations */
+ int tmp_rval; /* partial logging flags */
+ struct xfs_bmbt_irec old;
+
+ ASSERT(whichfork != XFS_ATTR_FORK);
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &PREV);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(isnullstartblock(PREV.br_startblock));
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ da_old = startblockval(PREV.br_startblock);
+ da_new = 0;
+
+ /*
+ * Set flags determining what part of the previous delayed allocation
+ * extent is being replaced by a real allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left and right neighbors are both contiguous with new.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+ ifp->if_nextents--;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left neighbor is contiguous, the right is not.
+ */
+ old = LEFT;
+ LEFT.br_blockcount += PREV.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The right neighbor is contiguous, the left is not. Take care
+ * with delay -> unwritten extent allocation here because the
+ * delalloc record we are overwriting is always written.
+ */
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is contiguous.
+ */
+ old = LEFT;
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ LEFT.br_blockcount += new->br_blockcount;
+
+ PREV.br_blockcount = temp;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is not contiguous.
+ */
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
+
+ PREV.br_startoff = new_endoff;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ old = RIGHT;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(bma->cur, &RIGHT);
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is not contiguous.
+ */
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
+
+ PREV.br_startblock = nullstartblock(da_new);
+ PREV.br_blockcount = temp;
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_next(ifp, &bma->icur);
+ break;
+
+ case 0:
+ /*
+ * Filling in the middle part of a previous delayed allocation.
+ * Contiguity is impossible here.
+ * This case is avoided almost all the time.
+ *
+ * We start with a delayed allocation:
+ *
+ * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+ * PREV @ idx
+ *
+ * and we are allocating:
+ * +rrrrrrrrrrrrrrrrr+
+ * new
+ *
+ * and we set it up for insertion as:
+ * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+ * new
+ * PREV @ idx LEFT RIGHT
+ * inserted at idx + 1
+ */
+ old = PREV;
+
+ /* LEFT is the new middle */
+ LEFT = *new;
+
+ /* RIGHT is the new right */
+ RIGHT.br_state = PREV.br_state;
+ RIGHT.br_startoff = new_endoff;
+ RIGHT.br_blockcount =
+ PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ RIGHT.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ RIGHT.br_blockcount));
+
+ /* truncate PREV */
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+ PREV.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ PREV.br_blockcount));
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
+ xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
+ ifp->if_nextents++;
+
+ if (bma->cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, 1, &tmp_rval, whichfork);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+
+ da_new = startblockval(PREV.br_startblock) +
+ startblockval(RIGHT.br_startblock);
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ /* add reverse mapping unless caller opted out */
+ if (!(bma->flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ &bma->cur, da_old > 0, &tmp_logflags,
+ whichfork);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ if (da_new != da_old)
+ xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+
+ if (bma->cur) {
+ da_new += bma->cur->bc_ino.allocated;
+ bma->cur->bc_ino.allocated = 0;
+ }
+
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_new != da_old) {
+ ASSERT(state == 0 || da_new < da_old);
+ error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
+ false);
+ }
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
+
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+int /* error */
+xfs_bmap_add_extent_unwritten_real(
+ struct xfs_trans *tp,
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp) /* inode logging flags */
+{
+ struct xfs_btree_cur *cur; /* btree cursor */
+ int error; /* error return value */
+ int i; /* temp state */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec old;
+
+ *logflagsp = 0;
+
+ cur = *curp;
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
+ ASSERT(!isnullstartblock(new->br_startblock));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ error = 0;
+ xfs_iext_get_extent(ifp, icur, &PREV);
+ ASSERT(new->br_state != PREV.br_state);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+ ifp->if_nextents -= 2;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ LEFT.br_blockcount += PREV.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+ ifp->if_nextents--;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, icur);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ ifp->if_nextents--;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ LEFT.br_blockcount += new->br_blockcount;
+
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ cur->bc_rec.b = *new;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &RIGHT);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ error = xfs_bmbt_update(cur, &RIGHT);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
+ goto done;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ old = PREV;
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+
+ r[0] = *new;
+ r[1].br_startoff = new_endoff;
+ r[1].br_blockcount =
+ old.br_startoff + old.br_blockcount - new_endoff;
+ r[1].br_startblock = new->br_startblock + new->br_blockcount;
+ r[1].br_state = PREV.br_state;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &r[1], state);
+ xfs_iext_insert(ip, icur, &r[0], state);
+ ifp->if_nextents += 2;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new right extent - oldext */
+ error = xfs_bmbt_update(cur, &r[1]);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ cur->bc_rec.b = PREV;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new middle extent - newext */
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ /* update reverse mappings */
+ xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ *logflagsp |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ *curp = cur;
+ }
+
+ xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+done:
+ *logflagsp |= rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
+
+/*
+ * Convert a hole to a delayed allocation.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ xfs_bmbt_irec_t *new) /* new data to add to file extents */
+{
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ temp = left.br_blockcount + new->br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, icur, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
+ false);
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
+ }
+}
+
+/*
+ * Convert a hole to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_hole_real(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new,
+ int *logflagsp,
+ uint32_t flags)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur = *curp;
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ int rval=0; /* return value (logging flags) */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
+
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+
+ XFS_STATS_INC(mp, xs_add_exlist);
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if this segment has a current value.
+ * Not true if we're inserting into the "hole" at eof.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * We're inserting a real allocation between "left" and "right".
+ * Set the contiguity flags. Don't let extents get too large.
+ */
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_startblock + left.br_blockcount == new->br_startblock &&
+ left.br_state == new->br_state &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_startblock + new->br_blockcount == right.br_startblock &&
+ new->br_state == right.br_state &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Select which case we're in here, and implement it.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with real allocations on the
+ * left and on the right.
+ * Merge all three into a single extent record.
+ */
+ left.br_blockcount += new->br_blockcount + right.br_blockcount;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ ifp->if_nextents--;
+
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, &right, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &left);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ old = left;
+ left.br_blockcount += new->br_blockcount;
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &left);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ old = right;
+
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = new->br_startblock;
+ right.br_blockcount += new->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &right);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_bmbt_update(cur, &right);
+ if (error)
+ goto done;
+ }
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * real allocation.
+ * Insert a new entry.
+ */
+ xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
+
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+ break;
+ }
+
+ /* add reverse mapping unless caller opted out */
+ if (!(flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(tp, ip, whichfork, new);
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, curp, 0,
+ &tmp_logflags, whichfork);
+ *logflagsp |= tmp_logflags;
+ cur = *curp;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur)
+ cur->bc_ino.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(cur, ip, whichfork);
+done:
+ *logflagsp |= rval;
+ return error;
+}
+
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
+ * Adjust the size of the new extent based on i_extsize and rt extsize.
+ */
+int
+xfs_bmap_extsize_align(
+ xfs_mount_t *mp,
+ xfs_bmbt_irec_t *gotp, /* next extent pointer */
+ xfs_bmbt_irec_t *prevp, /* previous extent pointer */
+ xfs_extlen_t extsz, /* align to this extent size */
+ int rt, /* is this a realtime inode? */
+ int eof, /* is extent at end-of-file? */
+ int delay, /* creating delalloc extent? */
+ int convert, /* overwriting unwritten extent? */
+ xfs_fileoff_t *offp, /* in/out: aligned offset */
+ xfs_extlen_t *lenp) /* in/out: aligned length */
+{
+ xfs_fileoff_t orig_off; /* original offset */
+ xfs_extlen_t orig_alen; /* original length */
+ xfs_fileoff_t orig_end; /* original off+len */
+ xfs_fileoff_t nexto; /* next file offset */
+ xfs_fileoff_t prevo; /* previous file offset */
+ xfs_fileoff_t align_off; /* temp for offset */
+ xfs_extlen_t align_alen; /* temp for length */
+ xfs_extlen_t temp; /* temp for calculations */
+
+ if (convert)
+ return 0;
+
+ orig_off = align_off = *offp;
+ orig_alen = align_alen = *lenp;
+ orig_end = orig_off + orig_alen;
+
+ /*
+ * If this request overlaps an existing extent, then don't
+ * attempt to perform any additional alignment.
+ */
+ if (!delay && !eof &&
+ (orig_off >= gotp->br_startoff) &&
+ (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+ return 0;
+ }
+
+ /*
+ * If the file offset is unaligned vs. the extent size
+ * we need to align it. This will be possible unless
+ * the file was previously written with a kernel that didn't
+ * perform this alignment, or if a truncate shot us in the
+ * foot.
+ */
+ div_u64_rem(orig_off, extsz, &temp);
+ if (temp) {
+ align_alen += temp;
+ align_off -= temp;
+ }
+
+ /* Same adjustment for the end of the requested area. */
+ temp = (align_alen % extsz);
+ if (temp)
+ align_alen += extsz - temp;
+
+ /*
+ * For large extent hint sizes, the aligned extent might be larger than
+ * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
+ */
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
+ align_alen -= extsz;
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
+
+ /*
+ * If the previous block overlaps with this proposed allocation
+ * then move the start forward without adjusting the length.
+ */
+ if (prevp->br_startoff != NULLFILEOFF) {
+ if (prevp->br_startblock == HOLESTARTBLOCK)
+ prevo = prevp->br_startoff;
+ else
+ prevo = prevp->br_startoff + prevp->br_blockcount;
+ } else
+ prevo = 0;
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ /*
+ * If the next block overlaps with this proposed allocation
+ * then move the start back without adjusting the length,
+ * but not before offset 0.
+ * This may of course make the start overlap previous block,
+ * and if we hit the offset 0 limit then the next block
+ * can still overlap too.
+ */
+ if (!eof && gotp->br_startoff != NULLFILEOFF) {
+ if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+ (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+ nexto = gotp->br_startoff + gotp->br_blockcount;
+ else
+ nexto = gotp->br_startoff;
+ } else
+ nexto = NULLFILEOFF;
+ if (!eof &&
+ align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto)
+ align_off = nexto > align_alen ? nexto - align_alen : 0;
+ /*
+ * If we're now overlapping the next or previous extent that
+ * means we can't fit an extsz piece in this hole. Just move
+ * the start forward to the first valid spot and set
+ * the length so we hit the end.
+ */
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ if (align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto &&
+ nexto != NULLFILEOFF) {
+ ASSERT(nexto > prevo);
+ align_alen = nexto - align_off;
+ }
+
+ /*
+ * If realtime, and the result isn't a multiple of the realtime
+ * extent size we need to remove blocks until it is.
+ */
+ if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ /*
+ * We're not covering the original request, or
+ * we won't be able to once we fix the length.
+ */
+ if (orig_off < align_off ||
+ orig_end > align_off + align_alen ||
+ align_alen - temp < orig_alen)
+ return -EINVAL;
+ /*
+ * Try to fix it by moving the start up.
+ */
+ if (align_off + temp <= orig_off) {
+ align_alen -= temp;
+ align_off += temp;
+ }
+ /*
+ * Try to fix it by moving the end in.
+ */
+ else if (align_off + align_alen - temp >= orig_end)
+ align_alen -= temp;
+ /*
+ * Set the start to the minimum then trim the length.
+ */
+ else {
+ align_alen -= orig_off - align_off;
+ align_off = orig_off;
+ align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ }
+ /*
+ * Result doesn't cover the request, fail it.
+ */
+ if (orig_off < align_off || orig_end > align_off + align_alen)
+ return -EINVAL;
+ } else {
+ ASSERT(orig_off >= align_off);
+ /* see XFS_BMBT_MAX_EXTLEN handling above */
+ ASSERT(orig_end <= align_off + align_alen ||
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
+ }
+
+#ifdef DEBUG
+ if (!eof && gotp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off + align_alen <= gotp->br_startoff);
+ if (prevp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+ *lenp = align_alen;
+ *offp = align_off;
+ return 0;
+}
+
+#define XFS_ALLOC_GAP_UNITS 4
+
+void
+xfs_bmap_adjacent(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ xfs_fsblock_t adjust; /* adjustment to block numbers */
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_mount_t *mp; /* mount point structure */
+ int nullfb; /* true if ap->firstblock isn't set */
+ int rt; /* true if inode is realtime */
+
+#define ISVALID(x,y) \
+ (rt ? \
+ (x) < mp->m_sb.sb_rblocks : \
+ XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+ XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+ XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+ mp = ap->ip->i_mount;
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+ rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ (ap->datatype & XFS_ALLOC_USERDATA);
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
+ ap->tp->t_firstblock);
+ /*
+ * If allocating at eof, and there's a previous real block,
+ * try to use its last block as our starting point.
+ */
+ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+ ap->prev.br_startblock)) {
+ ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+ /*
+ * Adjust for the gap between prevp and us.
+ */
+ adjust = ap->offset -
+ (ap->prev.br_startoff + ap->prev.br_blockcount);
+ if (adjust &&
+ ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+ ap->blkno += adjust;
+ }
+ /*
+ * If not at eof, then compare the two neighbor blocks.
+ * Figure out whether either one gives us a good starting point,
+ * and pick the better one.
+ */
+ else if (!ap->eof) {
+ xfs_fsblock_t gotbno; /* right side block number */
+ xfs_fsblock_t gotdiff=0; /* right side difference */
+ xfs_fsblock_t prevbno; /* left side block number */
+ xfs_fsblock_t prevdiff=0; /* left side difference */
+
+ /*
+ * If there's a previous (left) block, select a requested
+ * start block based on it.
+ */
+ if (ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ (prevbno = ap->prev.br_startblock +
+ ap->prev.br_blockcount) &&
+ ISVALID(prevbno, ap->prev.br_startblock)) {
+ /*
+ * Calculate gap to end of previous block.
+ */
+ adjust = prevdiff = ap->offset -
+ (ap->prev.br_startoff +
+ ap->prev.br_blockcount);
+ /*
+ * Figure the startblock based on the previous block's
+ * end and the gap size.
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the end of the previous block.
+ */
+ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+ ISVALID(prevbno + prevdiff,
+ ap->prev.br_startblock))
+ prevbno += adjust;
+ else
+ prevdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+ prevbno = NULLFSBLOCK;
+ }
+ /*
+ * No previous block or can't follow it, just default.
+ */
+ else
+ prevbno = NULLFSBLOCK;
+ /*
+ * If there's a following (right) block, select a requested
+ * start block based on it.
+ */
+ if (!isnullstartblock(ap->got.br_startblock)) {
+ /*
+ * Calculate gap to start of next block.
+ */
+ adjust = gotdiff = ap->got.br_startoff - ap->offset;
+ /*
+ * Figure the startblock based on the next block's
+ * start and the gap size.
+ */
+ gotbno = ap->got.br_startblock;
+ /*
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the start of the next block
+ * offset by our length.
+ */
+ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+ ISVALID(gotbno - gotdiff, gotbno))
+ gotbno -= adjust;
+ else if (ISVALID(gotbno - ap->length, gotbno)) {
+ gotbno -= ap->length;
+ gotdiff += adjust - ap->length;
+ } else
+ gotdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+ gotbno = NULLFSBLOCK;
+ }
+ /*
+ * No next block, just default.
+ */
+ else
+ gotbno = NULLFSBLOCK;
+ /*
+ * If both valid, pick the better one, else the only good
+ * one, else ap->blkno is already set (to 0 or the inode block).
+ */
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+ else if (prevbno != NULLFSBLOCK)
+ ap->blkno = prevbno;
+ else if (gotbno != NULLFSBLOCK)
+ ap->blkno = gotbno;
+ }
+#undef ISVALID
+}
+
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+ NULL);
+ if (error) {
+ /* Couldn't lock the AGF, so skip this AG. */
+ if (error == -EAGAIN) {
+ *notinit = 1;
+ error = 0;
+ }
+ goto out;
+ }
+ }
+
+ longest = xfs_alloc_longest_free_extent(pag,
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
+ if (*blen < longest)
+ *blen = longest;
+
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
+ /*
+ * Since we did a BUF_TRYLOCK above, it is possible that
+ * there is space for this request.
+ */
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
+ } else {
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
+ }
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag, startag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_START_BNO;
+ args->total = ap->total;
+
+ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (startag == NULLAGNUMBER)
+ startag = ag = 0;
+
+ while (*blen < args->maxlen) {
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ if (++ag == mp->m_sb.sb_agcount)
+ ag = 0;
+ if (ag == startag)
+ break;
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+
+ /*
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
+ */
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+ return 0;
+}
+
+/* Update all inode and quota accounting for the allocation we just did. */
+static void
+xfs_bmap_btalloc_accounting(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
+{
+ if (ap->flags & XFS_BMAPI_COWFORK) {
+ /*
+ * COW fork blocks are in-core only and thus are treated as
+ * in-core quota reservation (like delalloc blocks) even when
+ * converted to real blocks. The quota reservation is not
+ * accounted to disk until blocks are remapped to the data
+ * fork. So if these blocks were previously delalloc, we
+ * already have quota reservation and there's nothing to do
+ * yet.
+ */
+ if (ap->wasdel) {
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ return;
+ }
+
+ /*
+ * Otherwise, we've allocated blocks in a hole. The transaction
+ * has acquired in-core quota reservation for this extent.
+ * Rather than account these as real blocks, however, we reduce
+ * the transaction quota reservation based on the allocation.
+ * This essentially transfers the transaction quota reservation
+ * to that of a delalloc extent.
+ */
+ ap->ip->i_delayed_blks += args->len;
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
+ -(long)args->len);
+ return;
+ }
+
+ /* data/attr fork only */
+ ap->ip->i_nblocks += args->len;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel) {
+ ap->ip->i_delayed_blks -= args->len;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ }
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
+ args->len);
+}
+
+static int
+xfs_bmap_compute_alignments(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
+ int stripe_align = 0;
+
+ /* stripe alignment for allocation is determined by mount parameters */
+ if (mp->m_swidth && xfs_has_swalloc(mp))
+ stripe_align = mp->m_swidth;
+ else if (mp->m_dalign)
+ stripe_align = mp->m_dalign;
+
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else if (ap->datatype & XFS_ALLOC_USERDATA)
+ align = xfs_get_extsz_hint(ap->ip);
+ if (align) {
+ if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
+ ap->eof, 0, ap->conv, &ap->offset,
+ &ap->length))
+ ASSERT(0);
+ ASSERT(ap->length);
+ }
+
+ /* apply extent size hints if obtained earlier */
+ if (align) {
+ args->prod = align;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
+ args->prod = 1;
+ args->mod = 0;
+ } else {
+ args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ }
+
+ return stripe_align;
+}
+
+static void
+xfs_bmap_process_allocated_extent(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_fileoff_t orig_offset,
+ xfs_extlen_t orig_length)
+{
+ int nullfb;
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(nullfb ||
+ XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+ XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+
+ ap->blkno = args->fsbno;
+ if (nullfb)
+ ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+ * caller's allocation request offset down to extsz and the
+ * length up to another extsz boundary. If we found a free
+ * extent we mapped it in starting at this new offset. If the
+ * newly mapped space isn't long enough to cover any of the
+ * range of offsets that was originally requested, move the
+ * mapping up so that we can fill as much of the caller's
+ * original request as possible. Free space is apparently
+ * very fragmented so we're unlikely to be able to satisfy the
+ * hints anyway.
+ */
+ if (ap->length <= orig_length)
+ ap->offset = orig_offset;
+ else if (ap->offset + ap->length < orig_offset + orig_length)
+ ap->offset = orig_offset + orig_length - ap->length;
+ xfs_bmap_btalloc_accounting(ap, args);
+}
+
+#ifdef DEBUG
+static int
+xfs_bmap_exact_minlen_extent_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+
+ ASSERT(ap->length);
+
+ if (ap->minlen != 1) {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ args.alloc_minlen_only = 1;
+
+ xfs_bmap_compute_alignments(ap, &args);
+
+ if (ap->tp->t_firstblock == NULLFSBLOCK) {
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
+ } else {
+ ap->blkno = ap->tp->t_firstblock;
+ }
+
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minlen = args.maxlen = ap->minlen;
+ args.total = ap->total;
+
+ args.alignment = 1;
+ args.minalignslop = 0;
+
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+
+ return 0;
+}
+#else
+
+#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED)
+
+#endif
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_alloctype_t atype = 0;
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
+ ap->tp->t_firstblock);
+ if (nullfb) {
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
+ xfs_inode_is_filestream(ap->ip)) {
+ ag = xfs_filestream_lookup_ag(ap->ip);
+ ag = (ag != NULLAGNUMBER) ? ag : 0;
+ ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+ } else {
+ ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+ }
+ } else
+ ap->blkno = ap->tp->t_firstblock;
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * If allowed, use ap->blkno; otherwise must use firstblock since
+ * it's in the right allocation group.
+ */
+ if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
+ ;
+ else
+ ap->blkno = ap->tp->t_firstblock;
+ /*
+ * Normal allocation, done through xfs_alloc_vextent.
+ */
+ tryagain = isaligned = 0;
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+
+ /* Trim the allocation back to the maximum an AG can fit. */
+ args.maxlen = min(ap->length, mp->m_ag_max_usable);
+ blen = 0;
+ if (nullfb) {
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
+ xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ if (error)
+ return error;
+ } else if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
+ if (xfs_inode_is_filestream(ap->ip))
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ else
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.total = args.minlen = ap->minlen;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.total = ap->total;
+ args.minlen = ap->minlen;
+ }
+
+ /*
+ * If we are not low on available data blocks, and the underlying
+ * logical volume manager is a stripe, and the file offset is zero then
+ * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof
+ * is only set if the allocation length is >= the stripe unit and the
+ * allocation offset is at the end of file.
+ */
+ if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
+ if (!ap->offset) {
+ args.alignment = stripe_align;
+ atype = args.type;
+ isaligned = 1;
+ /*
+ * Adjust minlen to try and preserve alignment if we
+ * can't guarantee an aligned maxlen extent.
+ */
+ if (blen > args.alignment &&
+ blen <= args.maxlen + args.alignment)
+ args.minlen = blen - args.alignment;
+ args.minalignslop = 0;
+ } else {
+ /*
+ * First try an exact bno allocation.
+ * If it fails then do a near or start bno
+ * allocation with alignment turned on.
+ */
+ atype = args.type;
+ tryagain = 1;
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.alignment = 1;
+ /*
+ * Compute the minlen+alignment for the
+ * next case. Set slop so that the value
+ * of minlen+alignment+slop doesn't go up
+ * between the calls.
+ */
+ if (blen > stripe_align && blen <= args.maxlen)
+ nextminlen = blen - stripe_align;
+ else
+ nextminlen = args.minlen;
+ if (nextminlen + stripe_align > args.minlen + 1)
+ args.minalignslop =
+ nextminlen + stripe_align -
+ args.minlen - 1;
+ else
+ args.minalignslop = 0;
+ }
+ } else {
+ args.alignment = 1;
+ args.minalignslop = 0;
+ }
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (tryagain && args.fsbno == NULLFSBLOCK) {
+ /*
+ * Exact allocation failed. Now try with alignment
+ * turned on.
+ */
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = stripe_align;
+ args.minlen = nextminlen;
+ args.minalignslop = 0;
+ isaligned = 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ /*
+ * allocation failed, so turn off alignment and
+ * try again.
+ */
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb &&
+ args.minlen > ap->minlen) {
+ args.minlen = ap->minlen;
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = ap->blkno;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb) {
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = ap->minlen;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ ap->tp->t_flags |= XFS_TRANS_LOWMODE;
+ }
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+ return 0;
+}
+
+/* Trim extent to fit a logical block range. */
+void
+xfs_trim_extent(
+ struct xfs_bmbt_irec *irec,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len)
+{
+ xfs_fileoff_t distance;
+ xfs_fileoff_t end = bno + len;
+
+ if (irec->br_startoff + irec->br_blockcount <= bno ||
+ irec->br_startoff >= end) {
+ irec->br_blockcount = 0;
+ return;
+ }
+
+ if (irec->br_startoff < bno) {
+ distance = bno - irec->br_startoff;
+ if (isnullstartblock(irec->br_startblock))
+ irec->br_startblock = DELAYSTARTBLOCK;
+ if (irec->br_startblock != DELAYSTARTBLOCK &&
+ irec->br_startblock != HOLESTARTBLOCK)
+ irec->br_startblock += distance;
+ irec->br_startoff += distance;
+ irec->br_blockcount -= distance;
+ }
+
+ if (end < irec->br_startoff + irec->br_blockcount) {
+ distance = irec->br_startoff + irec->br_blockcount - end;
+ irec->br_blockcount -= distance;
+ }
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+ struct xfs_bmbt_irec *mval,
+ struct xfs_bmbt_irec *got,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int n,
+ uint32_t flags)
+{
+ if ((flags & XFS_BMAPI_ENTIRE) ||
+ got->br_startoff + got->br_blockcount <= obno) {
+ *mval = *got;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ return;
+ }
+
+ if (obno > *bno)
+ *bno = obno;
+ ASSERT((*bno >= obno) || (n == 0));
+ ASSERT(*bno < end);
+ mval->br_startoff = *bno;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ else
+ mval->br_startblock = got->br_startblock +
+ (*bno - got->br_startoff);
+ /*
+ * Return the minimum of what we got and what we asked for for
+ * the length. We can use the len variable here because it is
+ * modified below and we could have been there before coming
+ * here if the first part of the allocation didn't overlap what
+ * was asked for.
+ */
+ mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+ got->br_blockcount - (*bno - got->br_startoff));
+ mval->br_state = got->br_state;
+ ASSERT(mval->br_blockcount <= len);
+ return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+ struct xfs_bmbt_irec **map,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t *len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int *n,
+ uint32_t flags)
+{
+ xfs_bmbt_irec_t *mval = *map;
+
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ ((mval->br_startoff + mval->br_blockcount) <= end));
+ ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+ (mval->br_startoff < obno));
+
+ *bno = mval->br_startoff + mval->br_blockcount;
+ *len = end - *bno;
+ if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+ /* update previous map with new information */
+ ASSERT(mval->br_startblock == mval[-1].br_startblock);
+ ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+ ASSERT(mval->br_state == mval[-1].br_state);
+ mval[-1].br_blockcount = mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != HOLESTARTBLOCK &&
+ mval->br_startblock == mval[-1].br_startblock +
+ mval[-1].br_blockcount &&
+ mval[-1].br_state == mval->br_state) {
+ ASSERT(mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount);
+ mval[-1].br_blockcount += mval->br_blockcount;
+ } else if (*n > 0 &&
+ mval->br_startblock == DELAYSTARTBLOCK &&
+ mval[-1].br_startblock == DELAYSTARTBLOCK &&
+ mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount) {
+ mval[-1].br_blockcount += mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (!((*n == 0) &&
+ ((mval->br_startoff + mval->br_blockcount) <=
+ obno))) {
+ mval++;
+ (*n)++;
+ }
+ *map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *mval,
+ int *nmap,
+ uint32_t flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ xfs_fileoff_t obno;
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ int error;
+ bool eof = false;
+ int n = 0;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+ if (WARN_ON_ONCE(!ifp))
+ return -EFSCORRUPTED;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ return -EFSCORRUPTED;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_blk_mapr);
+
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ return error;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got))
+ eof = true;
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ /* Reading past eof, act as though there's a hole up to end. */
+ if (eof)
+ got.br_startoff = end;
+ if (got.br_startoff > bno) {
+ /* Reading in a hole. */
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+ mval->br_state = XFS_EXT_NORM;
+ bno += mval->br_blockcount;
+ len -= mval->br_blockcount;
+ mval++;
+ n++;
+ continue;
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
+ eof = true;
+ }
+ *nmap = n;
+ return 0;
+}
+
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
+int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *icur,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ int error;
+ xfs_fileoff_t aoff = off;
+
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
+
+ /* Figure out the extent size, adjust alen */
+ if (whichfork == XFS_COW_FORK) {
+ struct xfs_bmbt_irec prev;
+ xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
+
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_quota_reserve_blkres(ip, alen);
+ if (error)
+ return error;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
+ if (error)
+ goto out_unreserve_quota;
+
+ error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+ if (error)
+ goto out_unreserve_blocks;
+
+
+ ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip->i_mount, alen + indlen);
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+ return 0;
+
+out_unreserve_blocks:
+ xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_quota_unreserve_blkres(ip, alen);
+ return error;
+}
+
+static int
+xfs_bmap_alloc_userdata(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ int error;
+
+ /*
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
+ */
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+
+ if (mp->m_dalign && bma->length >= mp->m_dalign) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+
+ if (XFS_IS_REALTIME_INODE(bma->ip))
+ return xfs_bmap_rtalloc(bma);
+ }
+
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ return xfs_bmap_exact_minlen_extent_alloc(bma);
+
+ return xfs_bmap_btalloc(bma);
+}
+
+static int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ ASSERT(bma->length > 0);
+
+ /*
+ * For the wasdelay case, we could also just allocate the stuff asked
+ * for in this bmap call but that wouldn't be as good.
+ */
+ if (bma->wasdel) {
+ bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+ bma->offset = bma->got.br_startoff;
+ if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
+ bma->prev.br_startoff = NULLFILEOFF;
+ } else {
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
+ if (!bma->eof)
+ bma->length = XFS_FILBLKS_MIN(bma->length,
+ bma->got.br_startoff - bma->offset);
+ }
+
+ if (bma->flags & XFS_BMAPI_CONTIG)
+ bma->minlen = bma->length;
+ else
+ bma->minlen = 1;
+
+ if (bma->flags & XFS_BMAPI_METADATA) {
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ error = xfs_bmap_exact_minlen_extent_alloc(bma);
+ else
+ error = xfs_bmap_btalloc(bma);
+ } else {
+ error = xfs_bmap_alloc_userdata(bma);
+ }
+ if (error || bma->blkno == NULLFSBLOCK)
+ return error;
+
+ if (bma->flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
+ if (error)
+ return error;
+ }
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur)
+ bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+ /*
+ * Bump the number of extents we've allocated
+ * in this call.
+ */
+ bma->nallocs++;
+
+ if (bma->cur)
+ bma->cur->bc_ino.flags =
+ bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+
+ bma->got.br_startoff = bma->offset;
+ bma->got.br_startblock = bma->blkno;
+ bma->got.br_blockcount = bma->length;
+ bma->got.br_state = XFS_EXT_NORM;
+
+ if (bma->flags & XFS_BMAPI_PREALLOC)
+ bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+ if (bma->wasdel)
+ error = xfs_bmap_add_extent_delay_real(bma, whichfork);
+ else
+ error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+ whichfork, &bma->icur, &bma->cur, &bma->got,
+ &bma->logflags, bma->flags);
+
+ bma->logflags |= tmp_logflags;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+ * or xfs_bmap_add_extent_hole_real might have merged it into one of
+ * the neighbouring ones.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
+
+ ASSERT(bma->got.br_startoff <= bma->offset);
+ ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+ bma->offset + bma->length);
+ ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+ bma->got.br_state == XFS_EXT_UNWRITTEN);
+ return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+ struct xfs_bmalloca *bma,
+ struct xfs_bmbt_irec *mval,
+ xfs_filblks_t len,
+ uint32_t flags)
+{
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ /* check if we need to do unwritten->real conversion */
+ if (mval->br_state == XFS_EXT_UNWRITTEN &&
+ (flags & XFS_BMAPI_PREALLOC))
+ return 0;
+
+ /* check if we need to do real->unwritten conversion */
+ if (mval->br_state == XFS_EXT_NORM &&
+ (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+ return 0;
+
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+ ASSERT(mval->br_blockcount <= len);
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+ bma->ip, whichfork);
+ }
+ mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+ ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
+ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
+ &bma->icur, &bma->cur, mval, &tmp_logflags);
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+ * path because the conversion might not have done so (e.g., if the
+ * extent count hasn't changed). We need to make sure the inode is dirty
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
+ *
+ * Note: If we're only converting cow fork extents, there aren't
+ * any on-disk updates to make, so we don't need to log anything.
+ */
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that
+ * xfs_bmap_add_extent_unwritten_real might have merged it into one
+ * of the neighbouring ones.
+ */
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
+
+ /*
+ * We may have combined previously unwritten space with written space,
+ * so generate another request.
+ */
+ if (mval->br_blockcount < len)
+ return -EAGAIN;
+ return 0;
+}
+
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
+
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(ifp->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary. Details behaviour is controlled by the flags
+ * parameter. Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ */
+int
+xfs_bmapi_write(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ uint32_t flags, /* XFS_BMAPI_... */
+ xfs_extlen_t total, /* total blocks needed */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap) /* i/o: mval size/count */
+{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_fileoff_t end; /* end of mapped file region */
+ bool eof = false; /* after the end of extents */
+ int error; /* error return */
+ int n; /* current extent index */
+ xfs_fileoff_t obno; /* old block number (offset) */
+
+#ifdef DEBUG
+ xfs_fileoff_t orig_bno; /* original block number value */
+ int orig_flags; /* original flags arg value */
+ xfs_filblks_t orig_len; /* original value of len arg */
+ struct xfs_bmbt_irec *orig_mval; /* original value of mval */
+ int orig_nmap; /* original value of *nmap */
+
+ orig_bno = bno;
+ orig_len = len;
+ orig_flags = flags;
+ orig_mval = mval;
+ orig_nmap = *nmap;
+#endif
+
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(tp != NULL);
+ ASSERT(len > 0);
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
+
+ /* zeroing is for currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
+ eof = true;
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ n = 0;
+ end = bno + len;
+ obno = bno;
+ while (bno < end && n < *nmap) {
+ bool need_alloc = false, wasdelay = false;
+
+ /* in hole or beyond EOF? */
+ if (eof || bma.got.br_startoff > bno) {
+ /*
+ * CoW fork conversions should /never/ hit EOF or
+ * holes. There should always be something for us
+ * to work on.
+ */
+ ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
+ (flags & XFS_BMAPI_COWFORK)));
+
+ need_alloc = true;
+ } else if (isnullstartblock(bma.got.br_startblock)) {
+ wasdelay = true;
+ }
+
+ /*
+ * First, deal with the hole before the allocated space
+ * that we found, if any.
+ */
+ if (need_alloc || wasdelay) {
+ bma.eof = eof;
+ bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+ bma.wasdel = wasdelay;
+ bma.offset = bno;
+ bma.flags = flags;
+
+ /*
+ * There's a 32/64 bit type mismatch between the
+ * allocation length request (which can be 64 bits in
+ * length) and the bma length request, which is
+ * xfs_extlen_t and therefore 32 bits. Hence we have to
+ * check for 32-bit overflows and handle them here.
+ */
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
+ else
+ bma.length = len;
+
+ ASSERT(len > 0);
+ ASSERT(bma.length > 0);
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto error0;
+ if (bma.blkno == NULLFSBLOCK)
+ break;
+
+ /*
+ * If this is a CoW allocation, record the data in
+ * the refcount btree for orphan recovery.
+ */
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ }
+
+ /* Deal with the allocated space we found. */
+ xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+ end, n, flags);
+
+ /* Execute unwritten extent conversion if necessary */
+ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+ if (error == -EAGAIN)
+ continue;
+ if (error)
+ goto error0;
+
+ /* update the extent map to return */
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /*
+ * If we're done, stop now. Stop when we've allocated
+ * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
+ * the transaction may get too big.
+ */
+ if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ bma.prev = bma.got;
+ if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
+ eof = true;
+ }
+ *nmap = n;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
+
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE ||
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
+error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_off_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ struct xfs_bmalloca bma = { NULL };
+ uint16_t flags = 0;
+ struct xfs_trans *tp;
+ int error;
+
+ if (whichfork == XFS_COW_FORK)
+ flags |= IOMAP_F_SHARED;
+
+ /*
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_iext_count_may_overflow(ip, whichfork,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
+ */
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ /*
+ * When we're converting the delalloc reservations backing dirty pages
+ * in the page cache, we must be careful about how we create the new
+ * extents:
+ *
+ * New CoW fork extents are created unwritten, turned into real extents
+ * when we're about to write the data to disk, and mapped into the data
+ * fork after the write finishes. End of story.
+ *
+ * New data fork extents must be mapped in as unwritten and converted
+ * to real extents after the write succeeds to avoid exposing stale
+ * disk contents if we crash.
+ */
+ bma.flags = XFS_BMAPI_PREALLOC;
+ if (whichfork == XFS_COW_FORK)
+ bma.flags |= XFS_BMAPI_COWFORK;
+
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+int
+xfs_bmapi_remap(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ xfs_fsblock_t startblock,
+ uint32_t flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ int logflags = 0, error;
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(len > 0);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_NORMAP)));
+ ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
+ (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
+ /* make sure we only reflink into a hole. */
+ ASSERT(got.br_startoff > bno);
+ ASSERT(got.br_startoff - bno >= len);
+ }
+
+ ip->i_nblocks += len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ got.br_startoff = bno;
+ got.br_startblock = startblock;
+ got.br_blockcount = len;
+ if (flags & XFS_BMAPI_PREALLOC)
+ got.br_state = XFS_EXT_UNWRITTEN;
+ else
+ got.br_state = XFS_EXT_NORM;
+
+ error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
+ &cur, &got, &logflags, flags);
+ if (error)
+ goto error0;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
+
+error0:
+ if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~XFS_ILOG_DEXT;
+ else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~XFS_ILOG_DBROOT;
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * When a delalloc extent is split (e.g., due to a hole punch), the original
+ * indlen reservation must be shared across the two new extents that are left
+ * behind.
+ *
+ * Given the original reservation and the worst case indlen for the two new
+ * extents (as calculated by xfs_bmap_worst_indlen()), split the original
+ * reservation fairly across the two new extents. If necessary, steal available
+ * blocks from a deleted extent to make up a reservation deficiency (e.g., if
+ * ores == 1). The number of stolen blocks is returned. The availability and
+ * subsequent accounting of stolen blocks is the responsibility of the caller.
+ */
+static xfs_filblks_t
+xfs_bmap_split_indlen(
+ xfs_filblks_t ores, /* original res. */
+ xfs_filblks_t *indlen1, /* ext1 worst indlen */
+ xfs_filblks_t *indlen2, /* ext2 worst indlen */
+ xfs_filblks_t avail) /* stealable blocks */
+{
+ xfs_filblks_t len1 = *indlen1;
+ xfs_filblks_t len2 = *indlen2;
+ xfs_filblks_t nres = len1 + len2; /* new total res. */
+ xfs_filblks_t stolen = 0;
+ xfs_filblks_t resfactor;
+
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst case
+ * indlen for both new extents.
+ */
+ if (ores < nres && avail)
+ stolen = XFS_FILBLKS_MIN(nres - ores, avail);
+ ores += stolen;
+
+ /* nothing else to do if we've satisfied the new reservation */
+ if (ores >= nres)
+ return stolen;
+
+ /*
+ * We can't meet the total required reservation for the two extents.
+ * Calculate the percent of the overall shortage between both extents
+ * and apply this percentage to each of the requested indlen values.
+ * This distributes the shortage fairly and reduces the chances that one
+ * of the two extents is left with nothing when extents are repeatedly
+ * split.
+ */
+ resfactor = (ores * 100);
+ do_div(resfactor, nres);
+ len1 *= resfactor;
+ do_div(len1, 100);
+ len2 *= resfactor;
+ do_div(len2, 100);
+ ASSERT(len1 + len2 <= ores);
+ ASSERT(len1 < *indlen1 && len2 < *indlen2);
+
+ /*
+ * Hand out the remainder to each extent. If one of the two reservations
+ * is zero, we want to make sure that one gets a block first. The loop
+ * below starts with len1, so hand len2 a block right off the bat if it
+ * is zero.
+ */
+ ores -= (len1 + len2);
+ ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
+ if (ores && !len2 && *indlen2) {
+ len2++;
+ ores--;
+ }
+ while (ores) {
+ if (len1 < *indlen1) {
+ len1++;
+ ores--;
+ }
+ if (!ores)
+ break;
+ if (len2 < *indlen2) {
+ len2++;
+ ores--;
+ }
+ }
+
+ *indlen1 = len1;
+ *indlen2 = len2;
+
+ return stolen;
+}
+
+int
+xfs_bmap_del_extent_delay(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ int64_t da_old, da_new, da_diff = 0;
+ xfs_fileoff_t del_endoff, got_endoff;
+ xfs_filblks_t got_indlen, new_indlen, stolen;
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ int error = 0;
+ bool isrt;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+ da_old = startblockval(got->br_startblock);
+ da_new = 0;
+
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+
+ if (isrt) {
+ uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+
+ do_div(rtexts, mp->m_sb.sb_rextsize);
+ xfs_mod_frextents(mp, rtexts);
+ }
+
+ /*
+ * Update the inode delalloc counter now and wait to update the
+ * sb counters as we might have to borrow some blocks for the
+ * indirect block accounting.
+ */
+ ASSERT(!isrt);
+ error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
+ if (error)
+ return error;
+ ip->i_delayed_blks -= del->br_blockcount;
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got->br_blockcount = got->br_blockcount - del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ *
+ * Distribute the original indlen reservation across the two new
+ * extents. Steal blocks from the deleted extent if necessary.
+ * Stealing blocks simply fudges the fdblocks accounting below.
+ * Warn if either of the new indlen reservations is zero as this
+ * can lead to delalloc problems.
+ */
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
+
+ new.br_blockcount = got_endoff - del_endoff;
+ new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+
+ WARN_ON_ONCE(!got_indlen || !new_indlen);
+ stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
+ del->br_blockcount);
+
+ got->br_startblock = nullstartblock((int)got_indlen);
+
+ new.br_startoff = del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = nullstartblock((int)new_indlen);
+
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+
+ da_new = got_indlen + new_indlen - stolen;
+ del->br_blockcount -= stolen;
+ break;
+ }
+
+ ASSERT(da_old >= da_new);
+ da_diff = da_old - da_new;
+ if (!isrt)
+ da_diff += del->br_blockcount;
+ if (da_diff) {
+ xfs_mod_fdblocks(mp, da_diff, false);
+ xfs_mod_delalloc(mp, -da_diff);
+ }
+ return error;
+}
+
+void
+xfs_bmap_del_extent_cow(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec new;
+ xfs_fileoff_t del_endoff, got_endoff;
+ uint32_t state = BMAP_COWFORK;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ got->br_startblock = del->br_startblock + del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got->br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, got);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = del->br_startblock + del->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+ break;
+ }
+ ip->i_delayed_blks -= del->br_blockcount;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space.
+ */
+STATIC int /* error */
+xfs_bmap_del_extent_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_trans_t *tp, /* current transaction pointer */
+ struct xfs_iext_cursor *icur,
+ struct xfs_btree_cur *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork, /* data or attr fork */
+ uint32_t bflags) /* bmapi flags */
+{
+ xfs_fsblock_t del_endblock=0; /* first block past del */
+ xfs_fileoff_t del_endoff; /* first offset past del */
+ int do_fx; /* free extent at end of routine */
+ int error; /* error return value */
+ int flags = 0;/* inode logging flags */
+ struct xfs_bmbt_irec got; /* current extent entry */
+ xfs_fileoff_t got_endoff; /* first offset past got */
+ int i; /* temp state */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t nblks; /* quota/sb block count */
+ xfs_bmbt_irec_t new; /* new record to be inserted */
+ /* REFERENCED */
+ uint qfield; /* quota field to update */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
+
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(del->br_blockcount > 0);
+ xfs_iext_get_extent(ifp, icur, &got);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got.br_startblock));
+ qfield = 0;
+ error = 0;
+
+ /*
+ * If it's the case where the directory code is running with no block
+ * reservation, and the deleted block is in the middle of its extent,
+ * and the resulting insert of an extent would cause transformation to
+ * btree format, then reject it. The calling code will then swap blocks
+ * around instead. We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction will be dirty then.
+ */
+ if (tp->t_blk_res == 0 &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del->br_startoff > got.br_startoff && del_endoff < got_endoff)
+ return -ENOSPC;
+
+ flags = XFS_ILOG_CORE;
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
+ &mod);
+ ASSERT(mod == 0);
+
+ if (!(bflags & XFS_BMAPI_REMAP)) {
+ xfs_fsblock_t bno;
+
+ bno = div_u64_rem(del->br_startblock,
+ mp->m_sb.sb_rextsize, &mod);
+ ASSERT(mod == 0);
+
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ }
+
+ do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ } else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ }
+
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ }
+
+ if (got.br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ ifp->if_nextents--;
+
+ flags |= XFS_ILOG_CORE;
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+ case BMAP_LEFT_FILLING:
+ /*
+ * Deleting the first part of the extent.
+ */
+ got.br_startoff = del_endoff;
+ got.br_startblock = del_endblock;
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ break;
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Deleting the last part of the extent.
+ */
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+
+ old = got;
+
+ got.br_blockcount = del->br_startoff - got.br_startoff;
+ xfs_iext_update_extent(ip, state, icur, &got);
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got.br_state;
+ new.br_startblock = del_endblock;
+
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != -ENOSPC)
+ goto done;
+ /*
+ * If get no-space back from btree insert, it tried a
+ * split, and we have a zero block reservation. Fix up
+ * our state and return the error.
+ */
+ if (error == -ENOSPC) {
+ /*
+ * Reset the cursor, don't trust it after any
+ * insert operation.
+ */
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ error = xfs_bmbt_update(cur, &old);
+ if (error)
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_iext_update_extent(ip, state, icur, &old);
+ flags = 0;
+ error = -ENOSPC;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+
+ ifp->if_nextents++;
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
+ break;
+ }
+
+ /* remove reverse mapping */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, del);
+
+ /*
+ * If we need to, add to list of extents to delete.
+ */
+ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+ xfs_refcount_decrease_extent(tp, del);
+ } else {
+ __xfs_free_extent_later(tp, del->br_startblock,
+ del->br_blockcount, NULL,
+ (bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN);
+ }
+ }
+
+ /*
+ * Adjust inode # blocks in the file.
+ */
+ if (nblks)
+ ip->i_nblocks -= nblks;
+ /*
+ * Adjust quota data.
+ */
+ if (qfield && !(bflags & XFS_BMAPI_REMAP))
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value. If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int /* error */
+__xfs_bunmapi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t start, /* first file offset deleted */
+ xfs_filblks_t *rlen, /* i/o: amount remaining */
+ uint32_t flags, /* misc flags */
+ xfs_extnum_t nexts) /* number of extents max */
+{
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
+ struct xfs_bmbt_irec del; /* extent being deleted */
+ int error; /* error return value */
+ xfs_extnum_t extno; /* extent number in list */
+ struct xfs_bmbt_irec got; /* current extent record */
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ int isrt; /* freeing in rt area */
+ int logflags; /* transaction logging flags */
+ xfs_extlen_t mod; /* rt extent offset */
+ struct xfs_mount *mp = ip->i_mount;
+ int tmp_logflags; /* partial logging flags */
+ int wasdel; /* was a delayed alloc extent */
+ int whichfork; /* data or attribute fork */
+ xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ bool done = false;
+
+ trace_xfs_bunmap(ip, start, len, flags, _RET_IP_);
+
+ whichfork = xfs_bmapi_whichfork(flags);
+ ASSERT(whichfork != XFS_COW_FORK);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+ return -EFSCORRUPTED;
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(len > 0);
+ ASSERT(nexts >= 0);
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (xfs_iext_count(ifp) == 0) {
+ *rlen = 0;
+ return 0;
+ }
+ XFS_STATS_INC(mp, xs_blk_unmap);
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ end = start + len;
+
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
+ *rlen = 0;
+ return 0;
+ }
+ end--;
+
+ logflags = 0;
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ } else
+ cur = NULL;
+
+ if (isrt) {
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ }
+
+ extno = 0;
+ while (end != (xfs_fileoff_t)-1 && end >= start &&
+ (nexts == 0 || extno < nexts)) {
+ /*
+ * Is the found extent after a hole in which end lives?
+ * Just back up to the previous extent, if so.
+ */
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ /*
+ * Is the last block of this extent before the range
+ * we're supposed to delete? If so, we're done.
+ */
+ end = XFS_FILEOFF_MIN(end,
+ got.br_startoff + got.br_blockcount - 1);
+ if (end < start)
+ break;
+ /*
+ * Then deal with the (possibly delayed) allocated space
+ * we found.
+ */
+ del = got;
+ wasdel = isnullstartblock(del.br_startblock);
+
+ if (got.br_startoff < start) {
+ del.br_startoff = start;
+ del.br_blockcount -= start - got.br_startoff;
+ if (!wasdel)
+ del.br_startblock += start - got.br_startoff;
+ }
+ if (del.br_startoff + del.br_blockcount > end + 1)
+ del.br_blockcount = end + 1 - del.br_startoff;
+
+ if (!isrt)
+ goto delete;
+
+ sum = del.br_startblock + del.br_blockcount;
+ div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ /*
+ * Realtime extent not lined up at the end.
+ * The extent could have been split into written
+ * and unwritten pieces, or we could just be
+ * unmapping part of it. But we can't really
+ * get rid of part of a realtime extent.
+ */
+ if (del.br_state == XFS_EXT_UNWRITTEN) {
+ /*
+ * This piece is unwritten, or we're not
+ * using unwritten extents. Skip over it.
+ */
+ ASSERT(end >= mod);
+ end -= mod > del.br_blockcount ?
+ del.br_blockcount : mod;
+ if (end < got.br_startoff &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ continue;
+ }
+ /*
+ * It's written, turn it unwritten.
+ * This is better than zeroing it.
+ */
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ ASSERT(tp->t_blk_res > 0);
+ /*
+ * If this spans a realtime extent boundary,
+ * chop it back to the start of the one we end at.
+ */
+ if (del.br_blockcount > mod) {
+ del.br_startoff += del.br_blockcount - mod;
+ del.br_startblock += del.br_blockcount - mod;
+ del.br_blockcount = mod;
+ }
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+ whichfork, &icur, &cur, &del,
+ &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
+
+ /*
+ * Realtime extent is lined up at the end but not
+ * at the front. We'll get rid of full extents if
+ * we can.
+ */
+ if (del.br_blockcount > off) {
+ del.br_blockcount -= off;
+ del.br_startoff += off;
+ del.br_startblock += off;
+ } else if (del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ tp->t_blk_res == 0)) {
+ /*
+ * Can't make it unwritten. There isn't
+ * a full extent here so just skip it.
+ */
+ ASSERT(end >= del.br_blockcount);
+ end -= del.br_blockcount;
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
+ continue;
+ } else if (del.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec prev;
+ xfs_fileoff_t unwrite_start;
+
+ /*
+ * This one is already unwritten.
+ * It must have a written left neighbor.
+ * Unwrite the killed part of that one and
+ * try again.
+ */
+ if (!xfs_iext_prev_extent(ifp, &icur, &prev))
+ ASSERT(0);
+ ASSERT(prev.br_state == XFS_EXT_NORM);
+ ASSERT(!isnullstartblock(prev.br_startblock));
+ ASSERT(del.br_startblock ==
+ prev.br_startblock + prev.br_blockcount);
+ unwrite_start = max3(start,
+ del.br_startoff - mod,
+ prev.br_startoff);
+ mod = unwrite_start - prev.br_startoff;
+ prev.br_startoff = unwrite_start;
+ prev.br_startblock += mod;
+ prev.br_blockcount -= mod;
+ prev.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, whichfork, &icur, &cur,
+ &prev, &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ } else {
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, whichfork, &icur, &cur,
+ &del, &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ }
+
+delete:
+ if (wasdel) {
+ error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
+ &got, &del);
+ } else {
+ error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
+ &del, &tmp_logflags, whichfork,
+ flags);
+ logflags |= tmp_logflags;
+ }
+
+ if (error)
+ goto error0;
+
+ end = del.br_startoff - 1;
+nodelete:
+ /*
+ * If not done go on to the next (previous) record.
+ */
+ if (end != (xfs_fileoff_t)-1 && end >= start) {
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got))) {
+ done = true;
+ break;
+ }
+ extno++;
+ }
+ }
+ if (done || end == (xfs_fileoff_t)-1 || end < start)
+ *rlen = 0;
+ else
+ *rlen = end - start + 1;
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
+ whichfork);
+ }
+
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log inode even in the error case, if the transaction
+ * is dirty we'll need to shut down the filesystem.
+ */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ if (!error)
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+ return error;
+}
+
+/* Unmap a range of a file. */
+int
+xfs_bunmapi(
+ xfs_trans_t *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ uint32_t flags,
+ xfs_extnum_t nexts,
+ int *done)
+{
+ int error;
+
+ error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts);
+ *done = (len == 0);
+ return error;
+}
+
+/*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+ struct xfs_bmbt_irec *left, /* preceding extent */
+ struct xfs_bmbt_irec *got, /* current extent to shift */
+ xfs_fileoff_t shift) /* shift fsb */
+{
+ xfs_fileoff_t startoff;
+
+ startoff = got->br_startoff - shift;
+
+ /*
+ * The extent, once shifted, must be adjacent in-file and on-disk with
+ * the preceding extent.
+ */
+ if ((left->br_startoff + left->br_blockcount != startoff) ||
+ (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+ (left->br_state != got->br_state) ||
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
+ return false;
+
+ return true;
+}
+
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t shift, /* shift fsb */
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got, /* extent to shift */
+ struct xfs_bmbt_irec *left, /* preceding extent */
+ struct xfs_btree_cur *cur,
+ int *logflags) /* output */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ xfs_filblks_t blockcount;
+ int error, i;
+ struct xfs_mount *mp = ip->i_mount;
+
+ blockcount = left->br_blockcount + got->br_blockcount;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_bmse_can_merge(left, got, shift));
+
+ new = *left;
+ new.br_blockcount = blockcount;
+
+ /*
+ * Update the on-disk extent count, the btree if necessary and log the
+ * inode.
+ */
+ ifp->if_nextents--;
+ *logflags |= XFS_ILOG_CORE;
+ if (!cur) {
+ *logflags |= XFS_ILOG_DEXT;
+ goto done;
+ }
+
+ /* lookup and remove the extent to merge */
+ error = xfs_bmbt_lookup_eq(cur, got, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /* lookup and update size of the previous extent */
+ error = xfs_bmbt_lookup_eq(cur, left, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmbt_update(cur, &new);
+ if (error)
+ return error;
+
+ /* change to extent format if required after extent removal */
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+ if (error)
+ return error;
+
+done:
+ xfs_iext_remove(ip, icur, 0);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ &new);
+
+ /* update reverse mapping. rmap functions merge the rmaps for us */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, got);
+ memcpy(&new, got, sizeof(new));
+ new.br_startoff = left->br_startoff + left->br_blockcount;
+ xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ return 0;
+}
+
+static int
+xfs_bmap_shift_update_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_btree_cur *cur,
+ int *logflags,
+ xfs_fileoff_t startoff)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec prev = *got;
+ int error, i;
+
+ *logflags |= XFS_ILOG_CORE;
+
+ got->br_startoff = startoff;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &prev, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_bmbt_update(cur, got);
+ if (error)
+ return error;
+ } else {
+ *logflags |= XFS_ILOG_DEXT;
+ }
+
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ got);
+
+ /* update reverse mapping */
+ xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+ xfs_rmap_map_extent(tp, ip, whichfork, got);
+ return 0;
+}
+
+int
+xfs_bmap_collapse_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, prev;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff - offset_shift_fsb;
+ if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
+ if (new_startoff < prev.br_startoff + prev.br_blockcount) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+
+ if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ error = xfs_bmse_merge(tp, ip, whichfork,
+ offset_shift_fsb, &icur, &got, &prev,
+ cur, &logflags);
+ if (error)
+ goto del_cursor;
+ goto done;
+ }
+ } else {
+ if (got.br_startoff < offset_shift_fsb) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+ }
+
+ error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got,
+ cur, &logflags, new_startoff);
+ if (error)
+ goto del_cursor;
+
+done:
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+/* Make sure we won't be right-shifting an extent past the maximum bound. */
+int
+xfs_bmap_can_insert_extents(
+ struct xfs_inode *ip,
+ xfs_fileoff_t off,
+ xfs_fileoff_t shift)
+{
+ struct xfs_bmbt_irec got;
+ int is_empty;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty);
+ if (!error && !is_empty && got.br_startoff >= off &&
+ ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff)
+ error = -EINVAL;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+int
+xfs_bmap_insert_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done,
+ xfs_fileoff_t stop_fsb)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, next;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ }
+
+ if (*next_fsb == NULLFSBLOCK) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ stop_fsb > got.br_startoff) {
+ *done = true;
+ goto del_cursor;
+ }
+ } else {
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
+ }
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff + offset_shift_fsb;
+ if (xfs_iext_peek_next_extent(ifp, &icur, &next)) {
+ if (new_startoff + got.br_blockcount > next.br_startoff) {
+ error = -EINVAL;
+ goto del_cursor;
+ }
+
+ /*
+ * Unlike a left shift (which involves a hole punch), a right
+ * shift does not modify extent neighbors in any way. We should
+ * never find mergeable extents in this scenario. Check anyways
+ * and warn if we encounter two extents that could be one.
+ */
+ if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ WARN_ON_ONCE(1);
+ }
+
+ error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got,
+ cur, &logflags, new_startoff);
+ if (error)
+ goto del_cursor;
+
+ if (!xfs_iext_prev_extent(ifp, &icur, &got) ||
+ stop_fsb >= got.br_startoff + got.br_blockcount) {
+ *done = true;
+ goto del_cursor;
+ }
+
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur, error);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+/*
+ * Splits an extent into two extents at split_fsb block such that it is the
+ * first block of the current_ext. @ext is a target extent to be split.
+ * @split_fsb is a block where the extents is split. If split_fsb lies in a
+ * hole or the first block of extents, just return 0.
+ */
+int
+xfs_bmap_split_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t split_fsb)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec new; /* split extent */
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsblock_t gotblkcnt; /* new block count for got */
+ struct xfs_iext_cursor icur;
+ int error = 0;
+ int logflags = 0;
+ int i = 0;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+
+ /*
+ * If there are not extents, or split_fsb lies in a hole we are done.
+ */
+ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) ||
+ got.br_startoff >= split_fsb)
+ return 0;
+
+ gotblkcnt = split_fsb - got.br_startoff;
+ new.br_startoff = split_fsb;
+ new.br_startblock = got.br_startblock + gotblkcnt;
+ new.br_blockcount = got.br_blockcount - gotblkcnt;
+ new.br_state = got.br_state;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_ino.flags = 0;
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ }
+
+ got.br_blockcount = gotblkcnt;
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur,
+ &got);
+
+ logflags = XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto del_cursor;
+ } else
+ logflags |= XFS_ILOG_DEXT;
+
+ /* Add new extent */
+ xfs_iext_next(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &new, 0);
+ ifp->if_nextents++;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &new, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto del_cursor;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
+ }
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ }
+
+del_cursor:
+ if (cur) {
+ cur->bc_ino.allocated = 0;
+ xfs_btree_del_cursor(cur, error);
+ }
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+/* Deferred mapping is only for real extents in the data fork. */
+static bool
+xfs_bmap_is_update_needed(
+ struct xfs_bmbt_irec *bmap)
+{
+ return bmap->br_startblock != HOLESTARTBLOCK &&
+ bmap->br_startblock != DELAYSTARTBLOCK;
+}
+
+/* Record a bmap intent. */
+static int
+__xfs_bmap_add(
+ struct xfs_trans *tp,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ struct xfs_bmap_intent *bi;
+
+ trace_xfs_bmap_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ ip->i_ino, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&bi->bi_list);
+ bi->bi_type = type;
+ bi->bi_owner = ip;
+ bi->bi_whichfork = whichfork;
+ bi->bi_bmap = *bmap;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ return 0;
+}
+
+/* Map an extent into a file. */
+void
+xfs_bmap_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return;
+
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+}
+
+/* Unmap an extent out of a file. */
+void
+xfs_bmap_unmap_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return;
+
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+}
+
+/*
+ * Process one of the deferred bmap operations. We pass back the
+ * btree cursor to maintain our lock on the bmapbt between calls.
+ */
+int
+xfs_bmap_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+{
+ int error = 0;
+
+ ASSERT(tp->t_firstblock == NULLFSBLOCK);
+
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ ip->i_ino, whichfork, startoff, *blockcount, state);
+
+ if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
+ return -EFSCORRUPTED;
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp,
+ XFS_ERRTAG_BMAP_FINISH_ONE))
+ return -EIO;
+
+ switch (type) {
+ case XFS_BMAP_MAP:
+ error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
+ startblock, 0);
+ *blockcount = 0;
+ break;
+ case XFS_BMAP_UNMAP:
+ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+
+ return error;
+}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ return __this_address;
+
+ if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
+ if (!xfs_verify_rtext(mp, irec->br_startblock,
+ irec->br_blockcount))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbext(mp, irec->br_startblock,
+ irec->br_blockcount))
+ return __this_address;
+ }
+ if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+ return __this_address;
+ return NULL;
+}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644
index 000000000..16db95b11
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct xfs_bmbt_irec prev; /* extent before the new one */
+ struct xfs_bmbt_irec got; /* extent after, or delayed */
+
+ xfs_fileoff_t offset; /* offset in file filling in */
+ xfs_extlen_t length; /* i/o length asked/allocated */
+ xfs_fsblock_t blkno; /* starting block of new extent */
+
+ struct xfs_btree_cur *cur; /* btree cursor */
+ struct xfs_iext_cursor icur; /* incore extent cursor */
+ int nallocs;/* number of extents alloc'd */
+ int logflags;/* flags for transaction logging */
+
+ xfs_extlen_t total; /* total blocks needed for xaction */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
+ xfs_extlen_t minleft; /* amount must be left after alloc */
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
+ int datatype;/* data type being allocated */
+ uint32_t flags;
+};
+
+#define XFS_BMAP_MAX_NMAP 4
+
+/*
+ * Flags for xfs_bmapi_*
+ */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT (1u << 5)
+
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO (1u << 6)
+
+/*
+ * Map the inode offset to the block given in ap->firstblock. Primarily
+ * used for reflink. The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP (1u << 7)
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK (1u << 8)
+
+/* Skip online discard of freed extents */
+#define XFS_BMAPI_NODISCARD (1u << 9)
+
+/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
+#define XFS_BMAPI_NORMAP (1u << 10)
+
+#define XFS_BMAPI_FLAGS \
+ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
+ { XFS_BMAPI_METADATA, "METADATA" }, \
+ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
+ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
+ { XFS_BMAPI_CONTIG, "CONTIG" }, \
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }, \
+ { XFS_BMAPI_REMAP, "REMAP" }, \
+ { XFS_BMAPI_COWFORK, "COWFORK" }, \
+ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \
+ { XFS_BMAPI_NORMAP, "NORMAP" }
+
+
+static inline int xfs_bmapi_aflag(int w)
+{
+ return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+ (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
+{
+ if (bmapi_flags & XFS_BMAPI_COWFORK)
+ return XFS_COW_FORK;
+ else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
+
+/*
+ * Flags for xfs_bmap_add_extent*.
+ */
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
+
+#define XFS_BMAP_EXT_FLAGS \
+ { BMAP_LEFT_CONTIG, "LC" }, \
+ { BMAP_RIGHT_CONTIG, "RC" }, \
+ { BMAP_LEFT_FILLING, "LF" }, \
+ { BMAP_RIGHT_FILLING, "RF" }, \
+ { BMAP_ATTRFORK, "ATTR" }, \
+ { BMAP_COWFORK, "COW" }
+
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
+
+/*
+ * Return true if the extent is a real, allocated extent, or false if it is a
+ * delayed allocation, and unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmap_is_real_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
+}
+
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
+void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
+ xfs_filblks_t len);
+unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
+int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
+void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *last_block, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
+int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, uint32_t flags);
+int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
+ xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
+int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
+ xfs_extnum_t nexts);
+int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
+ xfs_extnum_t nexts, int *done);
+int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+uint xfs_default_attroffset(struct xfs_inode *ip);
+int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done);
+int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
+ xfs_fileoff_t shift);
+int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done, xfs_fileoff_t stop_fsb);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
+ int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
+
+enum xfs_bmap_intent_type {
+ XFS_BMAP_MAP = 1,
+ XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+ struct list_head bi_list;
+ enum xfs_bmap_intent_type bi_type;
+ int bi_whichfork;
+ struct xfs_inode *bi_owner;
+ struct xfs_bmbt_irec bi_bmap;
+};
+
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type, int whichfork,
+ xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
+void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
+{
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ return BMAP_ATTRFORK;
+ case XFS_COW_FORK:
+ return BMAP_COWFORK;
+ default:
+ return 0;
+ }
+}
+
+xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec);
+
+int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
+ uint32_t flags);
+
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
+
+#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000..cfa052d40
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_rmap.h"
+
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+ struct xfs_inode *ip,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen,
+ struct xfs_btree_block *rblock,
+ int rblocklen)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+ rblock->bb_level = dblock->bb_level;
+ ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+ rblock->bb_numrecs = dblock->bb_numrecs;
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+void
+xfs_bmbt_disk_get_all(
+ const struct xfs_bmbt_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ uint64_t l0 = get_unaligned_be64(&rec->l0);
+ uint64_t l1 = get_unaligned_be64(&rec->l1);
+
+ irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21);
+ irec->br_blockcount = l1 & xfs_mask64lo(21);
+ if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+ const struct xfs_bmbt_rec *r)
+{
+ return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+ const struct xfs_bmbt_rec *r)
+{
+ return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_disk_set_all(
+ struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s)
+{
+ int extent_flag = (s->br_state != XFS_EXT_NORM);
+
+ ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN);
+ ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)));
+ ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)));
+ ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)));
+
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+ ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+ ((xfs_bmbt_rec_base_t)s->br_blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1);
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ int rblocklen,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen)
+{
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ if (xfs_has_crc(mp)) {
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno ==
+ cpu_to_be64(XFS_BUF_DADDR_NULL));
+ } else
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_level != 0);
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *new;
+
+ new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ino.ip, cur->bc_ino.whichfork);
+
+ /*
+ * Copy the firstblock, dfops, and flags values,
+ * since init cursor doesn't get them.
+ */
+ new->bc_ino.flags = cur->bc_ino.flags;
+
+ return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+ struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst)
+{
+ ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
+ (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
+
+ dst->bc_ino.allocated += src->bc_ino.allocated;
+ dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
+
+ src->bc_ino.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = cur->bc_tp->t_firstblock;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork);
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = args.tp->t_blk_res;
+ } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+ if (!args.wasdel && args.tp->t_blk_res == 0) {
+ error = -ENOSPC;
+ goto error0;
+ }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+ * a full btree split. Try again and if
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+ cur->bc_tp->t_flags |= XFS_TRANS_LOWMODE;
+ }
+ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+ *stat = 0;
+ return 0;
+ }
+
+ ASSERT(args.len == 1);
+ cur->bc_tp->t_firstblock = args.fsbno;
+ cur->bc_ino.allocated++;
+ cur->bc_ino.ip->i_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+
+ new->l = cpu_to_be64(args.fsbno);
+
+ *stat = 1;
+ return 0;
+
+ error0:
+ return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+ ip->i_nblocks--;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0);
+ }
+
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+ return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff =
+ cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff = cpu_to_be64(
+ xfs_bmbt_disk_get_startoff(&rec->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&rec->bmbt) - 1);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_bmbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ cur->bc_rec.b.br_startoff;
+}
+
+STATIC int64_t
+xfs_bmbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+
+ /*
+ * Note: This routine previously casted a and b to int64 and subtracted
+ * them to generate a result. This lead to problems if b was the
+ * "maximum" key value (all ones) being signed incorrectly, hence this
+ * somewhat less efficient version.
+ */
+ if (a > b)
+ return 1;
+ if (b > a)
+ return -1;
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_bmbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ /*
+ * XXX: need a better way of verifying the owner here. Right now
+ * just make sure there has been one set.
+ */
+ fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ }
+
+ /*
+ * numrecs and level verification.
+ *
+ * We don't know what fork we belong to, so just verify that the level
+ * is less than the maximum of the two. Later checks will be more
+ * precise.
+ */
+ level = be16_to_cpu(block->bb_level);
+ if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+ return __this_address;
+
+ return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+}
+
+static void
+xfs_bmbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_lblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_bmbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_bmbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_bmbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
+ .verify_read = xfs_bmbt_read_verify,
+ .verify_write = xfs_bmbt_write_verify,
+ .verify_struct = xfs_bmbt_verify,
+};
+
+
+STATIC int
+xfs_bmbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be64_to_cpu(k1->bmbt.br_startoff) <
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+ xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+ .rec_len = sizeof(xfs_bmbt_rec_t),
+ .key_len = sizeof(xfs_bmbt_key_t),
+
+ .dup_cursor = xfs_bmbt_dup_cursor,
+ .update_cursor = xfs_bmbt_update_cursor,
+ .alloc_block = xfs_bmbt_alloc_block,
+ .free_block = xfs_bmbt_free_block,
+ .get_maxrecs = xfs_bmbt_get_maxrecs,
+ .get_minrecs = xfs_bmbt_get_minrecs,
+ .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
+ .key_diff = xfs_bmbt_key_diff,
+ .diff_two_keys = xfs_bmbt_diff_two_keys,
+ .buf_ops = &xfs_bmbt_buf_ops,
+ .keys_inorder = xfs_bmbt_keys_inorder,
+ .recs_inorder = xfs_bmbt_recs_inorder,
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur * /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* inode owning the btree */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur;
+ ASSERT(whichfork != XFS_COW_FORK);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
+
+ cur->bc_ops = &xfs_bmbt_ops;
+ cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ cur->bc_ino.ip = ip;
+ cur->bc_ino.allocated = 0;
+ cur->bc_ino.flags = 0;
+ cur->bc_ino.whichfork = whichfork;
+
+ return cur;
+}
+
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
+
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+ int blocklen,
+ int leaf)
+{
+ blocklen -= sizeof(xfs_bmdr_block_t);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmdr_rec_t);
+ return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_ino_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(tp || buffer_list);
+ ASSERT(!(tp && buffer_list));
+ ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
+
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Calculate the bmap btree size for some records. */
+unsigned long long
+xfs_bmbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
+}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644
index 000000000..3e7a40a83
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+ ((xfs_bmbt_rec_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+ ((xfs_bmbt_key_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_bmbt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+ ((xfs_bmdr_rec_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+ ((xfs_bmdr_key_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_bmdr_ptr_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+ (int)(XFS_BMBT_BLOCK_LEN(mp) + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+ (int)(sizeof(xfs_bmdr_block_t) + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+ struct xfs_btree_block *, int);
+
+void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r);
+void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s);
+
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+ xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_ino_t new_owner,
+ struct list_head *buffer_list);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_inode *, int);
+
+extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
+#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000..4c16c8c31
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,5099 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_alloc.h"
+#include "xfs_log.h"
+#include "xfs_btree_staging.h"
+#include "xfs_ag.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+
+/*
+ * Btree magic numbers.
+ */
+static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC, 0 },
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+ XFS_REFC_CRC_MAGIC }
+};
+
+uint32_t
+xfs_btree_magic(
+ int crc,
+ xfs_btnum_t btnum)
+{
+ uint32_t magic = xfs_magics[crc][btnum];
+
+ /* Ensure we asked for crc for crc-only magics. */
+ ASSERT(magic != 0);
+ return magic;
+}
+
+/*
+ * These sibling pointer checks are optimised for null sibling pointers. This
+ * happens a lot, and we don't need to byte swap at runtime if the sibling
+ * pointer is NULL.
+ *
+ * These are explicitly marked at inline because the cost of calling them as
+ * functions instead of inlining them is about 36 bytes extra code per call site
+ * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
+ * two sibling check functions reduces the compiled code size by over 300
+ * bytes.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ __be64 dsibling)
+{
+ xfs_fsblock_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_perag *pag,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agblock_t agbno,
+ __be32 dsibling)
+{
+ xfs_agblock_t sibling;
+
+ if (dsibling == cpu_to_be32(NULLAGBLOCK))
+ return NULL;
+
+ sibling = be32_to_cpu(dsibling);
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_btnum_t btnum = cur->bc_btnum;
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
+
+ if (crc) {
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno !=
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ if (block->bb_u.l.bb_pad != cpu_to_be32(0))
+ return __this_address;
+ }
+
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
+}
+
+/* Check a long btree block header. */
+static int
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_lblock(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Check a short btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ xfs_btnum_t btnum = cur->bc_btnum;
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
+
+ if (crc) {
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno !=
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ }
+
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+
+ if (bp)
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
+}
+
+/* Check a short btree block header. */
+STATIC int
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_sblock(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer containing block, if any */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_check_lblock(cur, block, level, bp);
+ else
+ return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/* Check that this long pointer is valid and points within the fs. */
+bool
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t fsbno,
+ int level)
+{
+ if (level <= 0)
+ return false;
+ return xfs_verify_fsbno(cur->bc_mp, fsbno);
+}
+
+/* Check that this short pointer is valid and points within the AG. */
+bool
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ int level)
+{
+ if (level <= 0)
+ return false;
+ return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+}
+
+/*
+ * Check that a given (indexed) btree pointer at a certain level of a
+ * btree is valid and doesn't point past where it should.
+ */
+static int
+xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
+ level))
+ return 0;
+ xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+ cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork, cur->bc_btnum,
+ level, index);
+ } else {
+ if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
+ level))
+ return 0;
+ xfs_err(cur->bc_mp,
+"AG %u: Corrupt btree %d pointer at level %d index %d.",
+ cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+ level, index);
+ }
+
+ return -EFSCORRUPTED;
+}
+
+#ifdef DEBUG
+# define xfs_btree_debug_check_ptr xfs_btree_check_ptr
+#else
+# define xfs_btree_debug_check_ptr(...) (0)
+#endif
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!xfs_has_crc(bp->b_mount))
+ return;
+ if (bip)
+ block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
+
+ return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!xfs_has_crc(bp->b_mount))
+ return;
+ if (bip)
+ block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
+
+ return true;
+}
+
+static int
+xfs_btree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ int error;
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (!error) {
+ xfs_trans_binval(cur->bc_tp, bp);
+ XFS_BTREE_STATS_INC(cur, free);
+ }
+ return error;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
+{
+ int i; /* btree level */
+
+ /*
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
+ */
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
+ else if (!error)
+ break;
+ }
+
+ /*
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor life time should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
+ */
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ xfs_is_shutdown(cur->bc_mp) || error != 0);
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+ kmem_free(cur->bc_ops);
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ kmem_cache_free(cur->bc_cache, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
+{
+ struct xfs_buf *bp; /* btree block's buffer pointer */
+ int error; /* error return value */
+ int i; /* level number of btree block */
+ xfs_mount_t *mp; /* mount structure for filesystem */
+ struct xfs_btree_cur *new; /* new cursor value */
+ xfs_trans_t *tp; /* transaction pointer, can be NULL */
+
+ tp = cur->bc_tp;
+ mp = cur->bc_mp;
+
+ /*
+ * Allocate a new cursor like the old one.
+ */
+ new = cur->bc_ops->dup_cursor(cur);
+
+ /*
+ * Copy the record currently in the cursor.
+ */
+ new->bc_rec = cur->bc_rec;
+
+ /*
+ * For each level current, re-get the buffer and copy the ptr value.
+ */
+ for (i = 0; i < new->bc_nlevels; i++) {
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
+ if (bp) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ xfs_buf_daddr(bp), mp->m_bsize,
+ 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (error) {
+ xfs_btree_del_cursor(new, error);
+ *ncur = NULL;
+ return error;
+ }
+ }
+ new->bc_levels[i].bp = bp;
+ }
+ *ncur = new;
+ return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values. A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers. The record and key structures are defined by the btree instances
+ * and opaque to the btree core. The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr). Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ *
+ * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * overlapping intervals. In such a tree, records are still sorted lowest to
+ * highest and indexed by the smallest key value that refers to the record.
+ * However, nodes are different: each pointer has two associated keys -- one
+ * indexing the lowest key available in the block(s) below (the same behavior
+ * as the key in a regular btree) and another indexing the highest key
+ * available in the block(s) below. Because records are /not/ sorted by the
+ * highest key, all leaf block updates require us to compute the highest key
+ * that matches any record in the leaf and to recursively update the high keys
+ * in the nodes going further up in the tree, if necessary. Nodes look like
+ * this:
+ *
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ *
+ * To perform an interval query on an overlapped tree, perform the usual
+ * depth-first search and use the low and high keys to decide if we can skip
+ * that particular node. If a leaf node is reached, return the records that
+ * intersect the interval. Note that an interval query may return numerous
+ * entries. For a non-overlapped tree, simply search for the record associated
+ * with the lowest key and iterate forward until a non-matching record is
+ * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
+ * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
+ * more detail.
+ *
+ * Why do we care about overlapping intervals? Let's say you have a bunch of
+ * reverse mapping records on a reflink filesystem:
+ *
+ * 1: +- file A startblock B offset C length D -----------+
+ * 2: +- file E startblock F offset G length H --------------+
+ * 3: +- file I startblock F offset J length K --+
+ * 4: +- file L... --+
+ *
+ * Now say we want to map block (B+D) into file A at offset (C+D). Ideally,
+ * we'd simply increment the length of record 1. But how do we find the record
+ * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return
+ * record 3 because the keys are ordered first by startblock. An interval
+ * query would return records 1 and 2 because they both overlap (B+D-1), and
+ * from that we can pick out record 1 as the appropriate left neighbor.
+ *
+ * In the non-overlapped case you can do a LE lookup and decrement the cursor
+ * because a record's interval must end before the next record.
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_LBLOCK_CRC_LEN;
+ return XFS_BTREE_LBLOCK_LEN;
+ }
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_SBLOCK_CRC_LEN;
+ return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+ return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th high key in a btree block.
+ */
+STATIC size_t
+xfs_btree_high_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+ struct xfs_btree_cur *cur,
+ int n,
+ int level)
+{
+ return xfs_btree_block_len(cur) +
+ cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+ (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+union xfs_btree_rec *
+xfs_btree_rec_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_rec *)
+ ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+union xfs_btree_key *
+xfs_btree_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th high key in the btree block.
+ */
+union xfs_btree_key *
+xfs_btree_high_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_high_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ int level = xfs_btree_get_level(block);
+
+ ASSERT(block->bb_level != 0);
+
+ return (union xfs_btree_ptr *)
+ ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ if (cur->bc_flags & XFS_BTREE_STAGING)
+ return cur->bc_ino.ifake->if_fork;
+ return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+struct xfs_btree_block * /* generic btree block pointer */
+xfs_btree_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in btree */
+ struct xfs_buf **bpp) /* buffer containing the block */
+{
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *bpp = NULL;
+ return xfs_btree_get_iroot(cur);
+ }
+
+ *bpp = cur->bc_levels[level].bp;
+ return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_firstrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ struct xfs_buf *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to 1, that's the first record/key.
+ */
+ cur->bc_levels[level].ptr = 1;
+ return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level. Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_lastrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ struct xfs_buf *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to numrecs, that's the last record/key.
+ */
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
+ return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ uint32_t fields, /* bitmask of fields */
+ const short *offsets, /* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last) /* output: last byte offset */
+{
+ int i; /* current bit number */
+ uint32_t imask; /* mask for current bit number */
+
+ ASSERT(fields != 0);
+ /*
+ * Find the lowest bit, so the first byte offset.
+ */
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
+ if (imask & fields) {
+ *first = offsets[i];
+ break;
+ }
+ }
+ /*
+ * Find the highest bit, so the last byte offset.
+ */
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
+ if (imask & fields) {
+ *last = offsets[i + 1] - 1;
+ break;
+ }
+ }
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int
+xfs_btree_read_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp; /* return value */
+ xfs_daddr_t d; /* real disk block address */
+ int error;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return -EFSCORRUPTED;
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp, ops);
+ if (error)
+ return error;
+ if (bp)
+ xfs_buf_set_ref(bp, refval);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
+{
+ xfs_daddr_t d;
+
+ ASSERT(fsbno != NULLFSBLOCK);
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
+{
+ xfs_daddr_t d;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agbno != NULLAGBLOCK);
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+ xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ left, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ right, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int lev, /* level in btree */
+ int lr) /* left/right bits */
+{
+ struct xfs_btree_block *block;
+
+ /*
+ * No readahead needed if we are at the root level and the
+ * btree root is stored in the inode.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (lev == cur->bc_nlevels - 1))
+ return 0;
+
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
+ return 0;
+
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_readahead_lblock(cur, lr, block);
+ return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+STATIC int
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ xfs_daddr_t *daddr)
+{
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+ int error;
+
+ error = xfs_btree_check_ptr(cur, ptr, 0, 1);
+ if (error)
+ return error;
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ fsbno = be64_to_cpu(ptr->l);
+ *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
+ } else {
+ agbno = be32_to_cpu(ptr->s);
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno);
+ }
+
+ return 0;
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_daddr_t daddr;
+
+ if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
+ return;
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int lev, /* level in btree */
+ struct xfs_buf *bp) /* new buffer to set */
+{
+ struct xfs_btree_block *b; /* btree block */
+
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
+
+ b = XFS_BUF_TO_BLOCK(bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
+ if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
+ } else {
+ if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
+ if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
+ }
+}
+
+bool
+xfs_btree_ptr_is_null(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return ptr->l == cpu_to_be64(NULLFSBLOCK);
+ else
+ return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
+void
+xfs_btree_set_ptr_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(NULLFSBLOCK);
+ else
+ ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+void
+xfs_btree_get_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->l = block->bb_u.l.bb_rightsib;
+ else
+ ptr->l = block->bb_u.l.bb_leftsib;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->s = block->bb_u.s.bb_rightsib;
+ else
+ ptr->s = block->bb_u.s.bb_leftsib;
+ }
+}
+
+void
+xfs_btree_set_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ const union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.l.bb_rightsib = ptr->l;
+ else
+ block->bb_u.l.bb_leftsib = ptr->l;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.s.bb_rightsib = ptr->s;
+ else
+ block->bb_u.s.bb_leftsib = ptr->s;
+ }
+}
+
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ int crc = xfs_has_crc(mp);
+ __u32 magic = xfs_btree_magic(crc, btnum);
+
+ buf->bb_magic = cpu_to_be32(magic);
+ buf->bb_level = cpu_to_be16(level);
+ buf->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (flags & XFS_BTREE_LONG_PTRS) {
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+ if (crc) {
+ buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
+ buf->bb_u.l.bb_pad = 0;
+ buf->bb_u.l.bb_lsn = 0;
+ }
+ } else {
+ /* owner is a 32 bit value on short blocks */
+ __u32 __owner = (__u32)owner;
+
+ buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ if (crc) {
+ buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
+ buf->bb_u.s.bb_lsn = 0;
+ }
+ }
+}
+
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
+ btnum, level, numrecs, owner, 0);
+}
+
+void
+xfs_btree_init_block_cur(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ int numrecs)
+{
+ __u64 owner;
+
+ /*
+ * we can pull the owner from the cursor right now as the different
+ * owners align directly with the pointer size of the btree. This may
+ * change in future, but is safe for current users of the generic btree
+ * code.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ owner = cur->bc_ino.ip->i_ino;
+ else
+ owner = cur->bc_ag.pag->pag_agno;
+
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
+ xfs_buf_daddr(bp), cur->bc_btnum, level,
+ numrecs, owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record. The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level)
+{
+ union xfs_btree_ptr ptr;
+
+ if (level > 0)
+ return 0;
+ if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ return 0;
+
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &ptr))
+ return 0;
+ return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ else {
+ ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ }
+}
+
+STATIC void
+xfs_btree_set_refs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ switch (cur->bc_btnum) {
+ case XFS_BTNUM_BNO:
+ case XFS_BTNUM_CNT:
+ xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+ break;
+ case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
+ xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+ break;
+ case XFS_BTNUM_BMAP:
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ break;
+ case XFS_BTNUM_RMAP:
+ xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
+ break;
+ case XFS_BTNUM_REFC:
+ xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+int
+xfs_btree_get_buf_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
+ if (error)
+ return error;
+ error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
+ 0, bpp);
+ if (error)
+ return error;
+
+ (*bpp)->b_ops = cur->bc_ops->buf_ops;
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
+ if (error)
+ return error;
+ error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (error)
+ return error;
+
+ xfs_btree_set_refs(cur, *bpp);
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+void
+xfs_btree_copy_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key,
+ int numkeys)
+{
+ ASSERT(numkeys >= 0);
+ memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *dst_rec,
+ union xfs_btree_rec *src_rec,
+ int numrecs)
+{
+ ASSERT(numrecs >= 0);
+ memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+void
+xfs_btree_copy_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr,
+ int numptrs)
+{
+ ASSERT(numptrs >= 0);
+ memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ int dir,
+ int numkeys)
+{
+ char *dst_key;
+
+ ASSERT(numkeys >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+ memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int dir,
+ int numrecs)
+{
+ char *dst_rec;
+
+ ASSERT(numrecs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+ memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int dir,
+ int numptrs)
+{
+ char *dst_ptr;
+
+ ASSERT(numptrs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+ memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+
+ if (bp) {
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_key_offset(cur, first),
+ xfs_btree_key_offset(cur, last + 1) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_rec_offset(cur, first),
+ xfs_btree_rec_offset(cur, last + 1) - 1);
+
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int first, /* index of first pointer to log */
+ int last) /* index of last pointer to log */
+{
+
+ if (bp) {
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ int level = xfs_btree_get_level(block);
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_ptr_offset(cur, first, level),
+ xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
+{
+ int first; /* first byte offset logged */
+ int last; /* last byte offset logged */
+ static const short soffsets[] = { /* table of offsets (short) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+ XFS_BTREE_SBLOCK_CRC_LEN
+ };
+ static const short loffsets[] = { /* table of offsets (long) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+ XFS_BTREE_LBLOCK_CRC_LEN
+ };
+
+ if (bp) {
+ int nbits;
+
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ /*
+ * We don't log the CRC when updating a btree
+ * block but instead recreate it during log
+ * recovery. As the log buffers have checksums
+ * of their own this is safe and avoids logging a crc
+ * update in a lot of places.
+ */
+ if (fields == XFS_BB_ALL_BITS)
+ fields = XFS_BB_ALL_BITS_CRC;
+ nbits = XFS_BB_NUM_BITS_CRC;
+ } else {
+ nbits = XFS_BB_NUM_BITS;
+ }
+ xfs_btree_offsets(fields,
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ loffsets : soffsets,
+ nbits, &first, &last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ }
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_increment(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the right at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* We're done if we remain in the block after the increment. */
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
+ goto out1;
+
+ /* Fail if we just went off the right edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, increment);
+
+ /*
+ * March up the tree incrementing pointers.
+ * Stop when we don't go off the right edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, lev, bp);
+ if (error)
+ goto error0;
+#endif
+
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
+ break;
+
+ /* Read-ahead the right block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_levels[lev].ptr = 1;
+ }
+out1:
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_decrement(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+ union xfs_btree_ptr ptr;
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the left at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+ /* We're done if we remain in the block after the decrement. */
+ if (--cur->bc_levels[level].ptr > 0)
+ goto out1;
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we just went off the left edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, decrement);
+
+ /*
+ * March up the tree decrementing pointers.
+ * Stop when we don't go off the left edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ if (--cur->bc_levels[lev].ptr > 0)
+ break;
+ /* Read-ahead the left block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ }
+
+ /*
+ * If we went off the root then we are seriously confused.
+ * or the root of the tree is in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
+ }
+out1:
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+int
+xfs_btree_lookup_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ const union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
+{
+ struct xfs_buf *bp; /* buffer pointer for btree block */
+ xfs_daddr_t daddr;
+ int error = 0;
+
+ /* special case the root block if in an inode */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *blkp = xfs_btree_get_iroot(cur);
+ return 0;
+ }
+
+ /*
+ * If the old buffer at this level for the disk address we are
+ * looking for re-use it.
+ *
+ * Otherwise throw it away and get a new one.
+ */
+ bp = cur->bc_levels[level].bp;
+ error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
+ if (error)
+ return error;
+ if (bp && xfs_buf_daddr(bp) == daddr) {
+ *blkp = XFS_BUF_TO_BLOCK(bp);
+ return 0;
+ }
+
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+ if (error)
+ return error;
+
+ /* Check the inode owner since the verifiers don't. */
+ if (xfs_has_crc(cur->bc_mp) &&
+ !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+ be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+ cur->bc_ino.ip->i_ino)
+ goto out_bad;
+
+ /* Did we get the level we were looking for? */
+ if (be16_to_cpu((*blkp)->bb_level) != level)
+ goto out_bad;
+
+ /* Check that internal nodes have at least one record. */
+ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
+ goto out_bad;
+
+ xfs_btree_setbuf(cur, level, bp);
+ return 0;
+
+out_bad:
+ *blkp = NULL;
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(cur->bc_tp, bp);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Get current search key. For level 0 we don't actually have a key
+ * structure so we make one up from the record. For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+ struct xfs_btree_cur *cur,
+ int level,
+ int keyno,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *kp)
+{
+ if (level == 0) {
+ cur->bc_ops->init_key_from_rec(kp,
+ xfs_btree_rec_addr(cur, keyno, block));
+ return kp;
+ }
+
+ return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record. The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ */
+int /* error */
+xfs_btree_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_lookup_t dir, /* <=, ==, or >= */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ int64_t diff; /* difference for the current key */
+ int error; /* error return value */
+ int keyno; /* current key number */
+ int level; /* level in the btree */
+ union xfs_btree_ptr *pp; /* ptr to btree block */
+ union xfs_btree_ptr ptr; /* ptr to btree block */
+
+ XFS_BTREE_STATS_INC(cur, lookup);
+
+ /* No such thing as a zero-level tree. */
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+ return -EFSCORRUPTED;
+
+ block = NULL;
+ keyno = 0;
+
+ /* initialise start pointer from cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ pp = &ptr;
+
+ /*
+ * Iterate over each level in the btree, starting at the root.
+ * For each level above the leaves, find the key we need, based
+ * on the lookup record, then follow the corresponding block
+ * pointer down to the next level.
+ */
+ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ /* Get the block we need to do the lookup on. */
+ error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+ if (error)
+ goto error0;
+
+ if (diff == 0) {
+ /*
+ * If we already had a key match at a higher level, we
+ * know we need to use the first entry in this block.
+ */
+ keyno = 1;
+ } else {
+ /* Otherwise search this block. Do a binary search. */
+
+ int high; /* high entry number */
+ int low; /* low entry number */
+
+ /* Set low and high entry numbers, 1-based. */
+ low = 1;
+ high = xfs_btree_get_numrecs(block);
+ if (!high) {
+ /* Block is empty, must be an empty leaf. */
+ if (level != 0 || cur->bc_nlevels != 1) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW,
+ cur->bc_mp, block,
+ sizeof(*block));
+ return -EFSCORRUPTED;
+ }
+
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
+ *stat = 0;
+ return 0;
+ }
+
+ /* Binary search the block. */
+ while (low <= high) {
+ union xfs_btree_key key;
+ union xfs_btree_key *kp;
+
+ XFS_BTREE_STATS_INC(cur, compare);
+
+ /* keyno is average of low and high. */
+ keyno = (low + high) >> 1;
+
+ /* Get current search key */
+ kp = xfs_lookup_get_search_key(cur, level,
+ keyno, block, &key);
+
+ /*
+ * Compute difference to get next direction:
+ * - less than, move right
+ * - greater than, move left
+ * - equal, we're done
+ */
+ diff = cur->bc_ops->key_diff(cur, kp);
+ if (diff < 0)
+ low = keyno + 1;
+ else if (diff > 0)
+ high = keyno - 1;
+ else
+ break;
+ }
+ }
+
+ /*
+ * If there are more levels, set up for the next level
+ * by getting the block number and filling in the cursor.
+ */
+ if (level > 0) {
+ /*
+ * If we moved left, need the previous key number,
+ * unless there isn't one.
+ */
+ if (diff > 0 && --keyno < 1)
+ keyno = 1;
+ pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+ error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
+ if (error)
+ goto error0;
+
+ cur->bc_levels[level].ptr = keyno;
+ }
+ }
+
+ /* Done with the search. See if we need to adjust the results. */
+ if (dir != XFS_LOOKUP_LE && diff < 0) {
+ keyno++;
+ /*
+ * If ge search and we went off the end of the block, but it's
+ * not the last block, we're in the wrong block.
+ */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (dir == XFS_LOOKUP_GE &&
+ keyno > xfs_btree_get_numrecs(block) &&
+ !xfs_btree_ptr_is_null(cur, &ptr)) {
+ int i;
+
+ cur->bc_levels[0].ptr = keyno;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ *stat = 1;
+ return 0;
+ }
+ } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ keyno--;
+ cur->bc_levels[0].ptr = keyno;
+
+ /* Return if we succeeded or not. */
+ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+ *stat = 0;
+ else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ *stat = 1;
+ else
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+/* Find the high key storage area from a regular key. */
+union xfs_btree_key *
+xfs_btree_high_key_from_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ return (union xfs_btree_key *)((char *)key +
+ (cur->bc_ops->key_len / 2));
+}
+
+/* Determine the low (and high if overlapped) keys of a leaf block */
+STATIC void
+xfs_btree_get_leaf_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key max_hkey;
+ union xfs_btree_key hkey;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key *high;
+ int n;
+
+ rec = xfs_btree_rec_addr(cur, 1, block);
+ cur->bc_ops->init_key_from_rec(key, rec);
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+
+ cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ rec = xfs_btree_rec_addr(cur, n, block);
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
+ > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
+ }
+}
+
+/* Determine the low (and high if overlapped) keys of a node block */
+STATIC void
+xfs_btree_get_node_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *max_hkey;
+ union xfs_btree_key *high;
+ int n;
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len / 2);
+
+ max_hkey = xfs_btree_high_key_addr(cur, 1, block);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ hkey = xfs_btree_high_key_addr(cur, n, block);
+ if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
+ } else {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len);
+ }
+}
+
+/* Derive the keys for any btree block. */
+void
+xfs_btree_get_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ if (be16_to_cpu(block->bb_level) == 0)
+ xfs_btree_get_leaf_keys(cur, block, key);
+ else
+ xfs_btree_get_node_keys(cur, block, key);
+}
+
+/*
+ * Decide if we need to update the parent keys of a btree block. For
+ * a standard btree this is only necessary if we're updating the first
+ * record/key. For an overlapping btree, we must always update the
+ * keys because the highest key can be in any of the records or keys
+ * in the block.
+ */
+static inline bool
+xfs_btree_needs_key_update(
+ struct xfs_btree_cur *cur,
+ int ptr)
+{
+ return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+}
+
+/*
+ * Update the low and high parent keys of the given level, progressing
+ * towards the root. If force_all is false, stop if the keys for a given
+ * level do not need updating.
+ */
+STATIC int
+__xfs_btree_updkeys(
+ struct xfs_btree_cur *cur,
+ int level,
+ struct xfs_btree_block *block,
+ struct xfs_buf *bp0,
+ bool force_all)
+{
+ union xfs_btree_key key; /* keys from current level */
+ union xfs_btree_key *lkey; /* keys from the next level up */
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *nlkey; /* keys from the next level up */
+ union xfs_btree_key *nhkey;
+ struct xfs_buf *bp;
+ int ptr;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+
+ /* Exit if there aren't any parent levels to update. */
+ if (level + 1 >= cur->bc_nlevels)
+ return 0;
+
+ trace_xfs_btree_updkeys(cur, level, bp0);
+
+ lkey = &key;
+ hkey = xfs_btree_high_key_from_key(cur, lkey);
+ xfs_btree_get_keys(cur, block, lkey);
+ for (level++; level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_updkeys(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+ ptr = cur->bc_levels[level].ptr;
+ nlkey = xfs_btree_key_addr(cur, ptr, block);
+ nhkey = xfs_btree_high_key_addr(cur, ptr, block);
+ if (!force_all &&
+ !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
+ cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+ break;
+ xfs_btree_copy_keys(cur, nlkey, lkey, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ if (level + 1 >= cur->bc_nlevels)
+ break;
+ xfs_btree_get_node_keys(cur, block, lkey);
+ }
+
+ return 0;
+}
+
+/* Update all the keys from some level in cursor back to the root. */
+STATIC int
+xfs_btree_updkeys_force(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_buf *bp;
+ struct xfs_btree_block *block;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ return __xfs_btree_updkeys(cur, level, block, bp, true);
+}
+
+/*
+ * Update the parent keys of the given level, progressing towards the root.
+ */
+STATIC int
+xfs_btree_update_keys(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_key *kp;
+ union xfs_btree_key key;
+ int ptr;
+
+ ASSERT(level >= 0);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ return __xfs_btree_updkeys(cur, level, block, bp, false);
+
+ /*
+ * Go up the tree from this level toward the root.
+ * At each level, update the key value to the value input.
+ * Stop when we reach a level where the cursor isn't pointing
+ * at the first entry in the block.
+ */
+ xfs_btree_get_keys(cur, block, &key);
+ for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+ ptr = cur->bc_levels[level].ptr;
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ }
+
+ return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error;
+ int ptr;
+ union xfs_btree_rec *rp;
+
+ /* Pick up the current block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ goto error0;
+#endif
+ /* Get the address of the rec to be updated. */
+ ptr = cur->bc_levels[0].ptr;
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ /* Fill in the new contents and log them. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, 0)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_UPDATE);
+ }
+
+ /* Pass new key value up to our parent. */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, 0);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_lshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs; /* left record count */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int rrecs; /* right record count */
+ union xfs_btree_ptr lptr; /* left btree pointer */
+ union xfs_btree_key *rkp = NULL; /* right btree key */
+ union xfs_btree_ptr *rpp = NULL; /* right address pointer */
+ union xfs_btree_rec *rrp = NULL; /* right record pointer */
+ int error; /* error return value */
+ int i;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1)
+ goto out0;
+
+ /* Set up variables for this block as "right". */
+ right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no left sibling then we can't shift an entry left. */
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &lptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ if (cur->bc_levels[level].ptr <= 1)
+ goto out0;
+
+ /* Set up the left neighbor as "left". */
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ rrecs = xfs_btree_get_numrecs(right);
+
+ /*
+ * We add one entry to the left side and remove one for the right side.
+ * Account for it here, the changes will be updated on disk and logged
+ * later.
+ */
+ lrecs++;
+ rrecs--;
+
+ XFS_BTREE_STATS_INC(cur, lshift);
+ XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+ /*
+ * If non-leaf, copy a key and a ptr to the left block.
+ * Log the changes to the left block.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
+ if (error)
+ goto error0;
+
+ xfs_btree_copy_keys(cur, lkp, rkp, 1);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+ xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur,
+ xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, 1);
+ xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->recs_inorder(cur,
+ xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+ }
+
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Slide the contents of right down one entry.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+ for (i = 0; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur,
+ xfs_btree_key_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_shift_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, right),
+ -1, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+ } else {
+ /* It's a leaf. operate on records */
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+ }
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the left.
+ */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+
+ /* Slide the cursor value left one. */
+ cur->bc_levels[level].ptr--;
+
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_rshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ union xfs_btree_ptr rptr; /* right block pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ int rrecs; /* right record count */
+ int lrecs; /* left record count */
+ int error; /* error return value */
+ int i; /* loop counter */
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1))
+ goto out0;
+
+ /* Set up variables for this block as "left". */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no right sibling then we can't shift an entry right. */
+ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (cur->bc_levels[level].ptr >= lrecs)
+ goto out0;
+
+ /* Set up the right neighbor as "right". */
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, rshift);
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Make a hole at the start of the right neighbor block, then
+ * copy the last left block entry to the hole.
+ */
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+ union xfs_btree_ptr *rpp;
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = rrecs - 1; i >= 0; i--) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+ xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+ error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
+ if (error)
+ goto error0;
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_keys(cur, rkp, lkp, 1);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+ xfs_btree_key_addr(cur, 2, right)));
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *lrp;
+ union xfs_btree_rec *rrp;
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_recs(cur, rrp, lrp, 1);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+ }
+
+ /*
+ * Decrement and log left's numrecs, bump and log right's numrecs.
+ */
+ xfs_btree_set_numrecs(left, --lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, ++rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the right.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error1;
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ *stat = 1;
+ return 0;
+
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int /* error */
+__xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rrptr; /* right-right sibling ptr */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ int lrecs;
+ int rrecs;
+ int src_index;
+ int error; /* error return value */
+ int i;
+
+ XFS_BTREE_STATS_INC(cur, split);
+
+ /* Set up left block (current one). */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block as "right". */
+ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* Fill in the btree header for the new right block. */
+ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+ /*
+ * Split the entries between the old and the new block evenly.
+ * Make sure that if there's an odd number of entries now, that
+ * each new block will have the same number of entries.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ rrecs = lrecs / 2;
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
+ rrecs++;
+ src_index = (lrecs - rrecs + 1);
+
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /* Adjust numrecs for the later get_*_keys() calls. */
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+ /*
+ * Copy btree block entries from the left block over to the
+ * new block, the right. Update the right block and log the
+ * changes.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, src_index, left);
+ lpp = xfs_btree_ptr_addr(cur, src_index, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = src_index; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ /* Copy the keys & pointers to the new block. */
+ xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_node_keys(cur, right, key);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, src_index, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ /* Copy records to the new block. */
+ xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_leaf_keys(cur, right, key);
+ }
+
+ /*
+ * Find the left block number by looking in the buffer.
+ * Adjust sibling pointers.
+ */
+ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+ xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /*
+ * If there's a block to the new block's right, make that block
+ * point back to right instead of to left.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+ error = xfs_btree_read_buf_block(cur, &rrptr,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the cursor is really in the right block, move it there.
+ * If it's just pointing past the last entry in left, then we'll
+ * insert there, so don't change anything in that case.
+ */
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
+ xfs_btree_setbuf(cur, level, rbp);
+ cur->bc_levels[level].ptr -= lrecs;
+ }
+ /*
+ * If there are more levels, we'll need another cursor which refers
+ * the right block, no matter where this cursor was.
+ */
+ if (level + 1 < cur->bc_nlevels) {
+ error = xfs_btree_dup_cursor(cur, curp);
+ if (error)
+ goto error0;
+ (*curp)->bc_levels[level + 1].ptr++;
+ }
+ *ptrp = rptr;
+ *stat = 1;
+ return 0;
+out0:
+ *stat = 0;
+ return 0;
+
+error0:
+ return error;
+}
+
+#ifdef __KERNEL__
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = 0;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+
+ xfs_trans_clear_context(args->cur->bc_tp);
+ current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+#else
+#define xfs_btree_split __xfs_btree_split
+#endif /* __KERNEL__ */
+
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int /* error */
+xfs_btree_new_iroot(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflags, /* logging flags for inode */
+ int *stat) /* return status - 0 fail */
+{
+ struct xfs_buf *cbp; /* buffer for cblock */
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_btree_block *cblock; /* child btree block */
+ union xfs_btree_key *ckp; /* child key pointer */
+ union xfs_btree_ptr *cpp; /* child ptr pointer */
+ union xfs_btree_key *kp; /* pointer to btree key */
+ union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr nptr; /* new block addr */
+ int level; /* btree level */
+ int error; /* error return code */
+ int i; /* loop counter */
+
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ level = cur->bc_nlevels - 1;
+
+ block = xfs_btree_get_iroot(cur);
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ return 0;
+
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Copy the root into a real block. */
+ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
+ if (error)
+ goto error0;
+
+ /*
+ * we can't just memcpy() the root in for CRC enabled btree blocks.
+ * In that case have to also ensure the blkno remains correct
+ */
+ memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = bno;
+ else
+ cblock->bb_u.s.bb_blkno = bno;
+ }
+
+ be16_add_cpu(&block->bb_level, 1);
+ xfs_btree_set_numrecs(block, 1);
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+ error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
+ if (error)
+ goto error0;
+
+ xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+ xfs_iroot_realloc(cur->bc_ino.ip,
+ 1 - xfs_btree_get_numrecs(cblock),
+ cur->bc_ino.whichfork);
+
+ xfs_btree_setbuf(cur, level, cbp);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+ xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+ *logflags |=
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
+ *stat = 1;
+ return 0;
+error0:
+ return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int /* error */
+xfs_btree_new_root(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* one half of the old root block */
+ struct xfs_buf *bp; /* buffer containing block */
+ int error; /* error return value */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *nbp; /* new (root) buffer */
+ struct xfs_btree_block *new; /* new (root) btree block */
+ int nptr; /* new value for key index, 1 or 2 */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rptr;
+ union xfs_btree_ptr lptr;
+
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ /* initialise our start point from the cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block. */
+ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
+ if (error)
+ goto error0;
+
+ /* Set the root in the holding structure increasing the level by 1. */
+ cur->bc_ops->set_root(cur, &lptr, 1);
+
+ /*
+ * At the previous root level there are now two blocks: the old root,
+ * and the new block generated when it was split. We don't know which
+ * one the cursor is pointing at, so we set up variables "left" and
+ * "right" for each case.
+ */
+ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /* Our block is left, pick up the right block. */
+ lbp = bp;
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+ left = block;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+ bp = rbp;
+ nptr = 1;
+ } else {
+ /* Our block is right, pick up the left block. */
+ rbp = bp;
+ xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+ right = block;
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+ bp = lbp;
+ nptr = 2;
+ }
+
+ /* Fill in the new block's btree header and log it. */
+ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+ xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+ ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+ !xfs_btree_ptr_is_null(cur, &rptr));
+
+ /* Fill in the key data in the new root. */
+ if (xfs_btree_get_level(left) > 0) {
+ /*
+ * Get the keys for the left block's keys and put them directly
+ * in the parent block. Do the same for the right block.
+ */
+ xfs_btree_get_node_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_node_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
+ } else {
+ /*
+ * Get the keys for the left block's records and put them
+ * directly in the parent block. Do the same for the right
+ * block.
+ */
+ xfs_btree_get_leaf_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_leaf_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
+ }
+ xfs_btree_log_keys(cur, nbp, 1, 2);
+
+ /* Fill in the pointer data in the new root. */
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+ xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+ /* Fix up the cursor. */
+ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ *stat = 1;
+ return 0;
+error0:
+ return error;
+out0:
+ *stat = 0;
+ return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* btree level */
+ int numrecs,/* # of recs in block */
+ int *oindex,/* old tree index */
+ int *index, /* new tree index */
+ union xfs_btree_ptr *nptr, /* new btree ptr */
+ struct xfs_btree_cur **ncur, /* new btree cursor */
+ union xfs_btree_key *key, /* key of new block */
+ int *stat)
+{
+ int error = 0;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_inode *ip = cur->bc_ino.ip;
+
+ if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+ /* A root block that can be made bigger. */
+ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+ *stat = 1;
+ } else {
+ /* A root block that needs replacing */
+ int logflags = 0;
+
+ error = xfs_btree_new_iroot(cur, &logflags, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+ }
+
+ return 0;
+ }
+
+ /* First, try shifting an entry to the right neighbor. */
+ error = xfs_btree_rshift(cur, level, stat);
+ if (error || *stat)
+ return error;
+
+ /* Next, try shifting an entry to the left neighbor. */
+ error = xfs_btree_lshift(cur, level, stat);
+ if (error)
+ return error;
+
+ if (*stat) {
+ *oindex = *index = cur->bc_levels[level].ptr;
+ return 0;
+ }
+
+ /*
+ * Next, try splitting the current block in half.
+ *
+ * If this works we have to re-set our variables because we
+ * could be in a different block now.
+ */
+ error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
+ if (error || *stat == 0)
+ return error;
+
+
+ *index = cur->bc_levels[level].ptr;
+ return 0;
+}
+
+/*
+ * Insert one record/level. Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level to insert record at */
+ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
+ union xfs_btree_rec *rec, /* record to insert */
+ union xfs_btree_key *key, /* i/o: block key for ptrp */
+ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer for block */
+ union xfs_btree_ptr nptr; /* new block ptr */
+ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
+ union xfs_btree_key nkey; /* new block key */
+ union xfs_btree_key *lkey;
+ int optr; /* old key/record index */
+ int ptr; /* key/record index */
+ int numrecs;/* number of records */
+ int error; /* error return value */
+ int i;
+ xfs_daddr_t old_bn;
+
+ ncur = NULL;
+ lkey = &nkey;
+
+ /*
+ * If we have an external root pointer, and we've made it to the
+ * root level, allocate a new root block and we're done.
+ */
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level >= cur->bc_nlevels)) {
+ error = xfs_btree_new_root(cur, stat);
+ xfs_btree_set_ptr_null(cur, ptrp);
+
+ return error;
+ }
+
+ /* If we're off the left edge, return failure. */
+ ptr = cur->bc_levels[level].ptr;
+ if (ptr == 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ optr = ptr;
+
+ XFS_BTREE_STATS_INC(cur, insrec);
+
+ /* Get pointers to the btree buffer and block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+
+ /* Check that the new entry is being inserted in the right place. */
+ if (ptr <= numrecs) {
+ if (level == 0) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rec,
+ xfs_btree_rec_addr(cur, ptr, block)));
+ } else {
+ ASSERT(cur->bc_ops->keys_inorder(cur, key,
+ xfs_btree_key_addr(cur, ptr, block)));
+ }
+ }
+#endif
+
+ /*
+ * If the block is full, we can't insert the new entry until we
+ * make the block un-full.
+ */
+ xfs_btree_set_ptr_null(cur, &nptr);
+ if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+ error = xfs_btree_make_block_unfull(cur, level, numrecs,
+ &optr, &ptr, &nptr, &ncur, lkey, stat);
+ if (error || *stat == 0)
+ goto error0;
+ }
+
+ /*
+ * The current block may have changed if the block was
+ * previously full and we have just made space in it.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /*
+ * At this point we know there's room for our new entry in the block
+ * we're pointing at.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+ for (i = numrecs - ptr; i >= 0; i--) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+ xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+ error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
+ if (error)
+ goto error0;
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_keys(cur, kp, key, 1);
+ xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+ numrecs++;
+ xfs_btree_set_numrecs(block, numrecs);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+ xfs_btree_key_addr(cur, ptr + 1, block)));
+ }
+#endif
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *rp;
+
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_set_numrecs(block, ++numrecs);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+ xfs_btree_rec_addr(cur, ptr + 1, block)));
+ }
+#endif
+ }
+
+ /* Log the new number of records in the btree header. */
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we just inserted into a new tree block, we have to
+ * recalculate nkey here because nkey is out of date.
+ *
+ * Otherwise we're just updating an existing block (having shoved
+ * some records into the new tree block), so use the regular key
+ * update mechanism.
+ */
+ if (bp && xfs_buf_daddr(bp) != old_bn) {
+ xfs_btree_get_keys(cur, block, lkey);
+ } else if (xfs_btree_needs_key_update(cur, optr)) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_INSREC);
+ }
+
+ /*
+ * Return the new block number, if any.
+ * If there is one, give back a record value and a cursor too.
+ */
+ *ptrp = nptr;
+ if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+ xfs_btree_copy_keys(cur, key, lkey, 1);
+ *curp = ncur;
+ }
+
+ *stat = 1;
+ return 0;
+
+error0:
+ if (ncur)
+ xfs_btree_del_cursor(ncur, error);
+ return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+ struct xfs_btree_cur *cur,
+ int *stat)
+{
+ int error; /* error return value */
+ int i; /* result value, 0 for failure */
+ int level; /* current level number in btree */
+ union xfs_btree_ptr nptr; /* new block number (split result) */
+ struct xfs_btree_cur *ncur; /* new cursor (split result) */
+ struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_key bkey; /* key of block to insert */
+ union xfs_btree_key *key;
+ union xfs_btree_rec rec; /* record to insert */
+
+ level = 0;
+ ncur = NULL;
+ pcur = cur;
+ key = &bkey;
+
+ xfs_btree_set_ptr_null(cur, &nptr);
+
+ /* Make a key out of the record data to be inserted, and save it. */
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(key, &rec);
+
+ /*
+ * Loop going up the tree, starting at the leaf level.
+ * Stop when we don't get a split block, that must mean that
+ * the insert is finished with this level.
+ */
+ do {
+ /*
+ * Insert nrec/nptr into this level of the tree.
+ * Note if we fail, nptr will be null.
+ */
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
+ &ncur, &i);
+ if (error) {
+ if (pcur != cur)
+ xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ level++;
+
+ /*
+ * See if the cursor we just used is trash.
+ * Can't trash the caller's cursor, but otherwise we should
+ * if ncur is a new cursor or we're about to be done.
+ */
+ if (pcur != cur &&
+ (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+ /* Save the state from the cursor before we trash it */
+ if (cur->bc_ops->update_cursor)
+ cur->bc_ops->update_cursor(pcur, cur);
+ cur->bc_nlevels = pcur->bc_nlevels;
+ xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+ }
+ /* If we got a new cursor, switch to it. */
+ if (ncur) {
+ pcur = ncur;
+ ncur = NULL;
+ }
+ } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+ *stat = i;
+ return 0;
+error0:
+ return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block. But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+ struct xfs_btree_cur *cur)
+{
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *cblock;
+ union xfs_btree_key *kp;
+ union xfs_btree_key *ckp;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_ptr *cpp;
+ struct xfs_buf *cbp;
+ int level;
+ int index;
+ int numrecs;
+ int error;
+#ifdef DEBUG
+ union xfs_btree_ptr ptr;
+#endif
+ int i;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_nlevels > 1);
+
+ /*
+ * Don't deal with the root block needs to be a leaf case.
+ * We're just going to turn the thing back into extents anyway.
+ */
+ level = cur->bc_nlevels - 1;
+ if (level == 1)
+ goto out0;
+
+ /*
+ * Give up if the root has multiple children.
+ */
+ block = xfs_btree_get_iroot(cur);
+ if (xfs_btree_get_numrecs(block) != 1)
+ goto out0;
+
+ cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+ numrecs = xfs_btree_get_numrecs(cblock);
+
+ /*
+ * Only do this if the next level will fit.
+ * Then the data must be copied up to the inode,
+ * instead of freeing the root you free the next level.
+ */
+ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+ index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+ if (index) {
+ xfs_iroot_realloc(cur->bc_ino.ip, index,
+ cur->bc_ino.whichfork);
+ block = ifp->if_broot;
+ }
+
+ be16_add_cpu(&block->bb_numrecs, index);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (error)
+ return error;
+ }
+
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ error = xfs_btree_free_block(cur, cbp);
+ if (error)
+ return error;
+
+ cur->bc_levels[level - 1].bp = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ xfs_trans_log_inode(cur->bc_tp, ip,
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ cur->bc_nlevels--;
+out0:
+ return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
+{
+ int error;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+ /*
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
+ */
+ cur->bc_ops->set_root(cur, newroot, -1);
+
+ error = xfs_btree_free_block(cur, bp);
+ if (error)
+ return error;
+
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
+ cur->bc_nlevels--;
+
+ return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat)
+{
+ int error;
+ int i;
+
+ if (level > 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ return error;
+ }
+
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int /* error */
+xfs_btree_delrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level removing record from */
+ int *stat) /* fail/done/go-on */
+{
+ struct xfs_btree_block *block; /* btree block */
+ union xfs_btree_ptr cptr; /* current block ptr */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int i; /* loop counter */
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs = 0; /* left record count */
+ int ptr; /* key/record index */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ int rrecs = 0; /* right record count */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int numrecs; /* temporary numrec count */
+
+ tcur = NULL;
+
+ /* Get the index of the entry being deleted, check for nothing there. */
+ ptr = cur->bc_levels[level].ptr;
+ if (ptr == 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /* Get the buffer & block containing the record or key/ptr. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we're off the end of the block. */
+ if (ptr > numrecs) {
+ *stat = 0;
+ return 0;
+ }
+
+ XFS_BTREE_STATS_INC(cur, delrec);
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+ /* Excise the entries being deleted. */
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+
+ lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+ lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+ for (i = 0; i < numrecs - ptr; i++) {
+ error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ if (ptr < numrecs) {
+ xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+ xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+ }
+ } else {
+ /* It's a leaf. operate on records */
+ if (ptr < numrecs) {
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, ptr + 1, block),
+ -1, numrecs - ptr);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+ }
+ }
+
+ /*
+ * Decrement and log the number of entries in the block.
+ */
+ xfs_btree_set_numrecs(block, --numrecs);
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, NULL,
+ ptr, LASTREC_DELREC);
+ }
+
+ /*
+ * We're at the root level. First, shrink the root block in-memory.
+ * Try to get rid of the next level down. If we can't then there's
+ * nothing left to do.
+ */
+ if (level == cur->bc_nlevels - 1) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ xfs_iroot_realloc(cur->bc_ino.ip, -1,
+ cur->bc_ino.whichfork);
+
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If this is the root level, and there's only one entry left,
+ * and it's NOT the leaf level, then we can get rid of this
+ * level.
+ */
+ if (numrecs == 1 && level > 0) {
+ union xfs_btree_ptr *pp;
+ /*
+ * pp is still set to the first pointer in the block.
+ * Make it the new root of the btree.
+ */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ error = xfs_btree_kill_root(cur, bp, level, pp);
+ if (error)
+ goto error0;
+ } else if (level > 0) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ }
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If we deleted the leftmost entry in the block, update the
+ * key values above us in the tree.
+ */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the number of records remaining in the block is at least
+ * the minimum, we're done.
+ */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ /*
+ * Otherwise, we have to move some records around to keep the
+ * tree balanced. Look at the left and right sibling blocks to
+ * see if we can re-balance by moving only one record.
+ */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * One child of root, need to get a chance to copy its contents
+ * into the root and delete it. Can't go up to next level,
+ * there's nothing to delete there.
+ */
+ if (xfs_btree_ptr_is_null(cur, &rptr) &&
+ xfs_btree_ptr_is_null(cur, &lptr) &&
+ level == cur->bc_nlevels - 2) {
+ error = xfs_btree_kill_iroot(cur);
+ if (!error)
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+ !xfs_btree_ptr_is_null(cur, &lptr));
+
+ /*
+ * Duplicate the cursor so our btree manipulations here won't
+ * disrupt the next level up.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * If there's a right sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /*
+ * Move the temp cursor to the last entry in the next block.
+ * Actually any entry but the first would suffice.
+ */
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ i = xfs_btree_lastrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ /* Grab a pointer to the block. */
+ right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(tcur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+ /*
+ * If right block is full enough so that removing one entry
+ * won't make it too empty, and left-shifting an entry out
+ * of right to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(right) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_lshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference, and fix up the temp cursor to point
+ * to our block again (last record).
+ */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+ }
+
+ /*
+ * If there's a left sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ /*
+ * Move the temp cursor to the first entry in the
+ * previous block.
+ */
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ /* Grab a pointer to the block. */
+ left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+ /*
+ * If left block is full enough so that removing one entry
+ * won't make it too empty, and right-shifting an entry out
+ * of left to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(left) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_rshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+ if (level == 0)
+ cur->bc_levels[0].ptr++;
+
+ *stat = 1;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ }
+
+ /* Delete the temp cursor, we're done with it. */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ /* If here, we need to do a join to keep the tree balanced. */
+ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+ if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+ lrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "right" to be the starting block,
+ * "left" to be the left neighbor.
+ */
+ rptr = cptr;
+ right = block;
+ rbp = bp;
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If that won't work, see if we can join with the right neighbor block.
+ */
+ } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+ rrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "left" to be the starting block,
+ * "right" to be the right neighbor.
+ */
+ lptr = cptr;
+ left = block;
+ lbp = bp;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * Otherwise, we can't fix the imbalance.
+ * Just return. This is probably a logic error, but it's not fatal.
+ */
+ } else {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ rrecs = xfs_btree_get_numrecs(right);
+ lrecs = xfs_btree_get_numrecs(left);
+
+ /*
+ * We're now going to join "left" and "right" by moving all the stuff
+ * in "right" to "left" and deleting "right".
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+ for (i = 1; i < rrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+
+ xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+ xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+ xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ }
+
+ XFS_BTREE_STATS_INC(cur, join);
+
+ /*
+ * Fix up the number of records and right block pointer in the
+ * surviving block, and log it.
+ */
+ xfs_btree_set_numrecs(left, lrecs + rrecs);
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /* If there is a right sibling, point it to the remaining block. */
+ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Free the deleted block. */
+ error = xfs_btree_free_block(cur, rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If we joined with the left neighbor, set the buffer in the
+ * cursor to the left block, and fix up the index.
+ */
+ if (bp != lbp) {
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
+ }
+ /*
+ * If we joined with the right neighbor and there's a level above
+ * us, increment the cursor at that level.
+ */
+ else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+ (level + 1 < cur->bc_nlevels)) {
+ error = xfs_btree_increment(cur, level + 1, &i);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Readjust the ptr at this level if it's not a leaf, since it's
+ * still pointing at the deletion point, which makes the cursor
+ * inconsistent. If this makes the ptr 0, the caller fixes it up.
+ * We can't use decrement because it would change the next level up.
+ */
+ if (level > 0)
+ cur->bc_levels[level].ptr--;
+
+ /*
+ * We combined blocks, so we have to update the parent keys if the
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
+ */
+
+ /* Return value means the next level up has something to do. */
+ *stat = 2;
+ return 0;
+
+error0:
+ if (tcur)
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int /* error */
+xfs_btree_delete(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ int error; /* error return value */
+ int level;
+ int i;
+ bool joined = false;
+
+ /*
+ * Go up the tree, starting at leaf level.
+ *
+ * If 2 is returned then a join was done; go to the next level.
+ * Otherwise we are done.
+ */
+ for (level = 0, i = 2; i == 2; level++) {
+ error = xfs_btree_delrec(cur, level, &i);
+ if (error)
+ goto error0;
+ if (i == 2)
+ joined = true;
+ }
+
+ /*
+ * If we combined blocks as part of deleting the record, delrec won't
+ * have updated the parent high keys so we have to do that here.
+ */
+ if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ error = xfs_btree_updkeys_force(cur, 0);
+ if (error)
+ goto error0;
+ }
+
+ if (i == 0) {
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ if (cur->bc_levels[level].ptr == 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ goto error0;
+ break;
+ }
+ }
+ }
+
+ *stat = i;
+ return 0;
+error0:
+ return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_btree_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_rec **recp, /* output: btree record */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer pointer */
+ int ptr; /* record number */
+#ifdef DEBUG
+ int error; /* error return value */
+#endif
+
+ ptr = cur->bc_levels[0].ptr;
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * Off the right end or left end, return failure.
+ */
+ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /*
+ * Point to the record and extract its data.
+ */
+ *recp = xfs_btree_rec_addr(cur, ptr, block);
+ *stat = 1;
+ return 0;
+}
+
+/* Visit a block in a btree. */
+STATIC int
+xfs_btree_visit_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+ int error;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* process the block */
+ error = fn(cur, level, data);
+ if (error)
+ return error;
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return -ENOENT;
+
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+
+/* Visit every block in a btree. */
+int
+xfs_btree_visit_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn,
+ unsigned int flags,
+ void *data)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
+
+ if (!(flags & XFS_BTREE_VISIT_LEAVES))
+ continue;
+ } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
+ continue;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_visit_block(cur, level, fn, data);
+ } while (!error);
+
+ if (error != -ENOENT)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+struct xfs_btree_block_change_owner_info {
+ uint64_t new_owner;
+ struct list_head *buffer_list;
+};
+
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ struct xfs_btree_block_change_owner_info *bbcoi = data;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
+ return 0;
+ block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
+ } else {
+ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
+ return 0;
+ block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
+ }
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ return 0;
+ }
+
+ if (cur->bc_tp) {
+ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ return -EAGAIN;
+ }
+ } else {
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ }
+
+ return 0;
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block_change_owner_info bbcoi;
+
+ bbcoi.new_owner = new_owner;
+ bbcoi.buffer_list = buffer_list;
+
+ return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
+ XFS_BTREE_VISIT_ALL, &bbcoi);
+}
+
+/* Verify the v5 fields of a long-format btree block. */
+xfs_failaddr_t
+xfs_btree_lblock_v5hdr_verify(
+ struct xfs_buf *bp,
+ uint64_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ if (owner != XFS_RMAP_OWN_UNKNOWN &&
+ be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+ return __this_address;
+ return NULL;
+}
+
+/* Verify a long-format btree block. */
+xfs_failaddr_t
+xfs_btree_lblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
+}
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * btree block
+ *
+ * @bp: buffer containing the btree block
+ */
+xfs_failaddr_t
+xfs_btree_sblock_v5hdr_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return __this_address;
+ return NULL;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+xfs_failaddr_t
+xfs_btree_sblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
+ */
+unsigned int
+xfs_btree_compute_maxlevels(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
+{
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * Query a regular btree for all records overlapping a given interval.
+ * Start with a LE lookup of the key of low_rec and return all records
+ * until we find a record with a key greater than the key of high_rec.
+ */
+STATIC int
+xfs_btree_simple_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec *recp;
+ union xfs_btree_key rec_key;
+ int64_t diff;
+ int stat;
+ bool firstrec = true;
+ int error;
+
+ ASSERT(cur->bc_ops->init_high_key_from_rec);
+ ASSERT(cur->bc_ops->diff_two_keys);
+
+ /*
+ * Find the leftmost record. The btree cursor must be set
+ * to the low record used to generate low_key.
+ */
+ stat = 0;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ goto out;
+
+ /* Nothing? See if there's anything to the right. */
+ if (!stat) {
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ goto out;
+ }
+
+ while (stat) {
+ /* Find the record. */
+ error = xfs_btree_get_rec(cur, &recp, &stat);
+ if (error || !stat)
+ break;
+
+ /* Skip if high_key(rec) < low_key. */
+ if (firstrec) {
+ cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
+ firstrec = false;
+ diff = cur->bc_ops->diff_two_keys(cur, low_key,
+ &rec_key);
+ if (diff > 0)
+ goto advloop;
+ }
+
+ /* Stop if high_key < low_key(rec). */
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
+ if (diff > 0)
+ break;
+
+ /* Callback */
+ error = fn(cur, recp, priv);
+ if (error)
+ break;
+
+advloop:
+ /* Move on to the next record. */
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ break;
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Query an overlapped interval btree for all records overlapping a given
+ * interval. This function roughly follows the algorithm given in
+ * "Interval Trees" of _Introduction to Algorithms_, which is section
+ * 14.3 in the 2nd and 3rd editions.
+ *
+ * First, generate keys for the low and high records passed in.
+ *
+ * For any leaf node, generate the high and low keys for the record.
+ * If the record keys overlap with the query low/high keys, pass the
+ * record to the function iterator.
+ *
+ * For any internal node, compare the low and high keys of each
+ * pointer against the query low/high keys. If there's an overlap,
+ * follow the pointer.
+ *
+ * As an optimization, we stop scanning a block when we find a low key
+ * that is greater than the query's high key.
+ */
+STATIC int
+xfs_btree_overlapped_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_key rec_key;
+ union xfs_btree_key rec_hkey;
+ union xfs_btree_key *lkp;
+ union xfs_btree_key *hkp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int64_t ldiff;
+ int64_t hdiff;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error;
+
+ /* Load the root of the btree. */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
+ if (error)
+ return error;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_levels[level].ptr = 1;
+
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+pop_up:
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_levels[level + 1].ptr++;
+ level++;
+ continue;
+ }
+
+ if (level == 0) {
+ /* Handle a leaf node. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+
+ cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
+ ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
+ low_key);
+
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
+ &rec_key);
+
+ /*
+ * If (record's high key >= query's low key) and
+ * (query's high key >= record's low key), then
+ * this record overlaps the query range; callback.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ error = fn(cur, recp, priv);
+ if (error)
+ break;
+ } else if (hdiff < 0) {
+ /* Record is larger than high key; pop. */
+ goto pop_up;
+ }
+ cur->bc_levels[level].ptr++;
+ continue;
+ }
+
+ /* Handle an internal node. */
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+
+ ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
+
+ /*
+ * If (pointer's high key >= query's low key) and
+ * (query's high key >= pointer's low key), then
+ * this record overlaps the query range; follow pointer.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ level--;
+ error = xfs_btree_lookup_get_block(cur, level, pp,
+ &block);
+ if (error)
+ goto out;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_levels[level].ptr = 1;
+ continue;
+ } else if (hdiff < 0) {
+ /* The low key is larger than the upper range; pop. */
+ goto pop_up;
+ }
+ cur->bc_levels[level].ptr++;
+ }
+
+out:
+ /*
+ * If we don't end this function with the cursor pointing at a record
+ * block, a subsequent non-error cursor deletion will not release
+ * node-level buffers, causing a buffer leak. This is quite possible
+ * with a zero-results range query, so release the buffers if we
+ * failed to return any results.
+ */
+ if (cur->bc_levels[0].bp == NULL) {
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
+ }
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Query a btree for all records overlapping a given interval of keys. The
+ * supplied function will be called with each record found; return one of the
+ * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
+ * code. This function returns -ECANCELED, zero, or a negative error code.
+ */
+int
+xfs_btree_query_range(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec rec;
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ /* Find the keys of both ends of the interval. */
+ cur->bc_rec = *high_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&high_key, &rec);
+
+ cur->bc_rec = *low_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&low_key, &rec);
+
+ /* Enforce low key < high key. */
+ if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+ return -EINVAL;
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return xfs_btree_simple_query_range(cur, &low_key,
+ &high_key, fn, priv);
+ return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
+ fn, priv);
+}
+
+/* Query a btree for all records. */
+int
+xfs_btree_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ memset(&low_key, 0, sizeof(low_key));
+ memset(&high_key, 0xFF, sizeof(high_key));
+
+ return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
+}
+
+static int
+xfs_btree_count_blocks_helper(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ xfs_extlen_t *blocks = data;
+ (*blocks)++;
+
+ return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_extlen_t *blocks)
+{
+ *blocks = 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+ XFS_BTREE_VISIT_ALL, blocks);
+}
+
+/* Compare two btree pointers. */
+int64_t
+xfs_btree_diff_two_ptrs(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
+ return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+}
+
+/* If there's an extent, we're done. */
+STATIC int
+xfs_btree_has_record_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ return -ECANCELED;
+}
+
+/* Is there a record covering a given range of keys? */
+int
+xfs_btree_has_record(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high,
+ bool *exists)
+{
+ int error;
+
+ error = xfs_btree_query_range(cur, low, high,
+ &xfs_btree_has_record_helper, NULL);
+ if (error == -ECANCELED) {
+ *exists = true;
+ return 0;
+ }
+ *exists = false;
+ return error;
+}
+
+/* Are there more records in this btree? */
+bool
+xfs_btree_has_more_records(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+ /* There are still records in this block. */
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
+ return true;
+
+ /* There are more record blocks. */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
+ else
+ return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
+}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
new file mode 100644
index 000000000..eef27858a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -0,0 +1,606 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ifork;
+struct xfs_perag;
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+ __be32 s; /* short form ptr */
+ __be64 l; /* long form ptr */
+};
+
+/*
+ * The in-core btree key. Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both. The __*bigkey
+ * items should never be accessed directly.
+ */
+union xfs_btree_key {
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key __rmap_bigkey[2];
+ struct xfs_refcount_key refc;
+};
+
+union xfs_btree_rec {
+ struct xfs_bmbt_rec bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ struct xfs_alloc_rec alloc;
+ struct xfs_inobt_rec inobt;
+ struct xfs_rmap_rec rmap;
+ struct xfs_refcount_rec refc;
+};
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
+#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
+
+uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+
+/*
+ * For logging record fields.
+ */
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
+#define XFS_BB_NUM_BITS 5
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_NUM_BITS_CRC 9
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
+
+/*
+ * Generic stats interface
+ */
+#define XFS_BTREE_STATS_INC(cur, stat) \
+ XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+
+struct xfs_btree_ops {
+ /* size of the key and record structures */
+ size_t key_len;
+ size_t rec_len;
+
+ /* cursor operations */
+ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+ void (*update_cursor)(struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst);
+
+ /* update btree root pointer */
+ void (*set_root)(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr, int level_change);
+
+ /* block allocation / freeing */
+ int (*alloc_block)(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat);
+ int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+ /* update last record information */
+ void (*update_lastrec)(struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr, int reason);
+
+ /* records in block/level */
+ int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+ int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* records on disk. Matter for the root in inode case. */
+ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* init values of btree structures */
+ void (*init_key_from_rec)(union xfs_btree_key *key,
+ const union xfs_btree_rec *rec);
+ void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec);
+ void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+ void (*init_high_key_from_rec)(union xfs_btree_key *key,
+ const union xfs_btree_rec *rec);
+
+ /* difference between key value and cursor value */
+ int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key);
+
+ /*
+ * Difference between key2 and key1 -- positive if key1 > key2,
+ * negative if key1 < key2, and zero if equal.
+ */
+ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2);
+
+ const struct xfs_buf_ops *buf_ops;
+
+ /* check that k1 is lower than k2 */
+ int (*keys_inorder)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2);
+
+ /* check that r1 is lower than r2 */
+ int (*recs_inorder)(struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2);
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
+union xfs_btree_irec {
+ struct xfs_alloc_rec_incore a;
+ struct xfs_bmbt_irec b;
+ struct xfs_inobt_rec_incore i;
+ struct xfs_rmap_irec r;
+ struct xfs_refcount_irec rc;
+};
+
+/* Per-AG btree information. */
+struct xfs_btree_cur_ag {
+ struct xfs_perag *pag;
+ union {
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ };
+ union {
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
+ };
+};
+
+/* Btree-in-inode cursor information */
+struct xfs_btree_cur_ino {
+ struct xfs_inode *ip;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ int allocated;
+ short forksize;
+ char whichfork;
+ char flags;
+/* We are converting a delalloc reservation */
+#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
+
+/* For extent swap, ignore owner check in verifier */
+#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
+};
+
+struct xfs_btree_level {
+ /* buffer pointer */
+ struct xfs_buf *bp;
+
+ /* key/record number */
+ uint16_t ptr;
+
+ /* readahead info */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
+};
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+struct xfs_btree_cur
+{
+ struct xfs_trans *bc_tp; /* transaction we're in, if any */
+ struct xfs_mount *bc_mp; /* file system mount struct */
+ const struct xfs_btree_ops *bc_ops;
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
+ union xfs_btree_irec bc_rec; /* current insert/search record value */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
+
+ /*
+ * Short btree pointers need an agno to be able to turn the pointers
+ * into physical addresses for IO, so the btree cursor switches between
+ * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
+ * cursor.
+ */
+ union {
+ struct xfs_btree_cur_ag bc_ag;
+ struct xfs_btree_cur_ino bc_ino;
+ };
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
+
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
+#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata. The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING (1<<5)
+
+#define XFS_BTREE_NOERROR 0
+#define XFS_BTREE_ERROR 1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
+
+/*
+ * Internal long and short btree block checks. They return NULL if the
+ * block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+
+/*
+ * Check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp); /* buffer containing block, if any */
+
+/*
+ * Check that (long) pointer is ok.
+ */
+bool /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_fsblock_t fsbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Check that (short) pointer is ok.
+ */
+bool /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t agbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ uint32_t fields, /* bitmask of fields */
+ const short *offsets,/* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int /* error */
+xfs_btree_read_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void /* error */
+xfs_btree_reada_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void /* error */
+xfs_btree_reada_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner);
+
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ xfs_btnum_t btnum,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
+
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
+ struct list_head *buffer_list);
+
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+ uint16_t numrecs)
+{
+ block->bb_numrecs = cpu_to_be16(numrecs);
+}
+
+static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_level);
+}
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
+#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
+#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
+#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
+#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
+#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
+
+xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+ uint64_t owner);
+xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
+
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating. Any non-zero value will be
+ * passed up to the _query_range caller. The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
+typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec, void *priv);
+
+int xfs_btree_query_range(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+ void *priv);
+
+typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
+ void *data);
+/* Visit record blocks. */
+#define XFS_BTREE_VISIT_RECORDS (1 << 0)
+/* Visit leaf blocks. */
+#define XFS_BTREE_VISIT_LEAVES (1 << 1)
+/* Visit all blocks. */
+#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \
+ XFS_BTREE_VISIT_LEAVES)
+int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
+
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
+union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
+ struct xfs_btree_block *block);
+int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
+ const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
+struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
+ int level, struct xfs_buf **bpp);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr);
+int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
+void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr, int lr);
+void xfs_btree_get_keys(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_key *key);
+union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
+int xfs_btree_has_record(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high, bool *exists);
+bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
+
+/* Does this cursor point to the last block in the given level? */
+static inline bool
+xfs_btree_islastblock(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+ return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
+ struct xfs_buf **bpp);
+void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
+ int lr);
+void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp, int level, int numrecs);
+void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr, int numptrs);
+void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key, int numkeys);
+
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
+#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
new file mode 100644
index 000000000..dd75e208b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+#include "xfs_btree_staging.h"
+
+/*
+ * Staging Cursors and Fake Roots for Btrees
+ * =========================================
+ *
+ * A staging btree cursor is a special type of btree cursor that callers must
+ * use to construct a new btree index using the btree bulk loader code. The
+ * bulk loading code uses the staging btree cursor to abstract the details of
+ * initializing new btree blocks and filling them with records or key/ptr
+ * pairs. Regular btree operations (e.g. queries and modifications) are not
+ * supported with staging cursors, and callers must not invoke them.
+ *
+ * Fake root structures contain all the information about a btree that is under
+ * construction by the bulk loading code. Staging btree cursors point to fake
+ * root structures instead of the usual AG header or inode structure.
+ *
+ * Callers are expected to initialize a fake root structure and pass it into
+ * the _stage_cursor function for a specific btree type. When bulk loading is
+ * complete, callers should call the _commit_staged_btree function for that
+ * specific btree type to commit the new btree into the filesystem.
+ */
+
+/*
+ * Don't allow staging cursors to be duplicated because they're supposed to be
+ * kept private to a single thread.
+ */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(0);
+ return NULL;
+}
+
+/*
+ * Don't allow block allocation for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Don't allow block freeing for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ */
+STATIC int
+xfs_btree_fakeroot_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xbtree_afakeroot *afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ afake = cur->bc_ag.afake;
+ ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/*
+ * Bulk Loading for AG Btrees
+ * ==========================
+ *
+ * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
+ * staging cursor. Callers should initialize this to zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_afakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/* Update the btree root information for a per-AG fake root. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ afake->af_root = be32_to_cpu(ptr->s);
+ afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.
+ * The btree cursor's bc_ops will be overridden as needed to make the staging
+ * functionality work.
+ */
+void
+xfs_btree_stage_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->set_root = xfs_btree_afakeroot_set_root;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ag.afake = afake;
+ cur->bc_nlevels = afake->af_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_afakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.agbp = agbp;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading for Inode-Rooted Btrees
+ * ====================================
+ *
+ * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
+ * the staging cursor. This structure should be initialized as follows:
+ *
+ * - if_fork_size field should be set to the number of bytes available to the
+ * fork in the inode.
+ *
+ * - if_fork should point to a freshly allocated struct xfs_ifork.
+ *
+ * - if_format should be set to the appropriate fork type (e.g.
+ * XFS_DINODE_FMT_BTREE).
+ *
+ * All other fields must be zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_ifakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root. The btree cursor's bc_ops will be overridden as needed to make the
+ * staging functionality work. If new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ino.ifake = ifake;
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+
+ if (new_ops)
+ *new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_ifakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ino.ifake = NULL;
+ cur->bc_ino.whichfork = whichfork;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single root block). When the creation is complete, the new root can be
+ * linked atomically into the filesystem by committing the staged cursor.
+ *
+ * Creation of a new btree proceeds roughly as follows:
+ *
+ * The first step is to initialize an appropriate fake btree root structure and
+ * then construct a staged btree cursor. Refer to the block comments about
+ * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
+ * more information about how to do this.
+ *
+ * The second step is to initialize a struct xfs_btree_bload context as
+ * documented in the structure definition.
+ *
+ * The third step is to call xfs_btree_bload_compute_geometry to compute the
+ * height of and the number of blocks needed to construct the btree. See the
+ * section "Computing the Geometry of the New Btree" for details about this
+ * computation.
+ *
+ * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
+ * save them for later use by ->claim_block(). Bulk loading requires all
+ * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
+ * rebuild, and to minimize seek distances of the new btree.
+ *
+ * Step five is to call xfs_btree_bload() to start constructing the btree.
+ *
+ * The final step is to commit the staging btree cursor, which logs the new
+ * btree root and turns the staging cursor into a regular cursor. The caller
+ * is responsible for cleaning up the previous btree blocks, if any.
+ *
+ * Computing the Geometry of the New Btree
+ * =======================================
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records
+ * in the bload structure. For node levels, the number of items for the level
+ * is the number of blocks in the next lower level of the tree. For each
+ * level, the desired number of items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = floor(nr_items / desired)
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs. The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb = nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division. If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * The height and number of fs blocks required to create the btree are computed
+ * and returned via btree_height and nr_blocks.
+ */
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written to disk when bulk loading is finished.
+ */
+static void
+xfs_btree_bload_drop_buf(
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
+{
+ if (*bpp == NULL)
+ return;
+
+ if (!xfs_buf_delwri_queue(*bpp, buffers_list))
+ ASSERT(0);
+
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
+}
+
+/*
+ * Allocate and initialize one btree block for bulk loading.
+ *
+ * The new btree block will have its level and numrecs fields set to the values
+ * of the level and nr_this_block parameters, respectively.
+ *
+ * The caller should ensure that ptrp, bpp, and blockp refer to the left
+ * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
+ * will all point to the new block.
+ */
+STATIC int
+xfs_btree_bload_prep_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ unsigned int level,
+ unsigned int nr_this_block,
+ union xfs_btree_ptr *ptrp, /* in/out */
+ struct xfs_buf **bpp, /* in/out */
+ struct xfs_btree_block **blockp, /* in/out */
+ void *priv)
+{
+ union xfs_btree_ptr new_ptr;
+ struct xfs_buf *new_bp;
+ struct xfs_btree_block *new_block;
+ int ret;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ size_t new_size;
+
+ ASSERT(*bpp == NULL);
+
+ /* Allocate a new incore btree root block. */
+ new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot_bytes = (int)new_size;
+
+ /* Initialize it and send it out. */
+ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+ XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+ nr_this_block, cur->bc_ino.ip->i_ino,
+ cur->bc_flags);
+
+ *bpp = NULL;
+ *blockp = ifp->if_broot;
+ xfs_btree_set_ptr_null(cur, ptrp);
+ return 0;
+ }
+
+ /* Claim one of the caller's preallocated blocks. */
+ xfs_btree_set_ptr_null(cur, &new_ptr);
+ ret = bbl->claim_block(cur, &new_ptr, priv);
+ if (ret)
+ return ret;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+ if (ret)
+ return ret;
+
+ /*
+ * The previous block (if any) is the left sibling of the new block,
+ * so set its right sibling pointer to the new block and drop it.
+ */
+ if (*blockp)
+ xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+ xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ /* Initialize the new btree block. */
+ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+ xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+
+ /* Set the out parameters. */
+ *bpp = new_bp;
+ *blockp = new_block;
+ xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+ return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ xfs_btree_bload_get_record_fn get_record,
+ struct xfs_btree_block *block,
+ void *priv)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the leaf block with records. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_rec *block_rec;
+
+ ret = get_record(cur, priv);
+ if (ret)
+ return ret;
+ block_rec = xfs_btree_rec_addr(cur, j, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return 0;
+}
+
+/*
+ * Load one node block with key/ptr pairs.
+ *
+ * child_ptr must point to a block within the next level down in the tree. A
+ * key/ptr entry will be created in the new node block to the block pointed to
+ * by child_ptr. On exit, child_ptr points to the next block on the child
+ * level that needs processing.
+ */
+STATIC int
+xfs_btree_bload_node(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ union xfs_btree_ptr *child_ptr,
+ struct xfs_btree_block *block)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the node block with keys and pointers. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_key child_key;
+ union xfs_btree_ptr *block_ptr;
+ union xfs_btree_key *block_key;
+ struct xfs_btree_block *child_block;
+ struct xfs_buf *child_bp;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ &child_bp);
+ if (ret)
+ return ret;
+
+ block_ptr = xfs_btree_ptr_addr(cur, j, block);
+ xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+ block_key = xfs_btree_key_addr(cur, j, block);
+ xfs_btree_get_keys(cur, child_block, &child_key);
+ xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+ xfs_btree_get_sibling(cur, child_block, child_ptr,
+ XFS_BB_RIGHTSIB);
+ xfs_buf_relse(child_bp);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree. Caller is responsible for having set
+ * @cur->bc_ino.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int ret;
+
+ if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+ return cur->bc_ops->get_dmaxrecs(cur, level);
+
+ ret = cur->bc_ops->get_maxrecs(cur, level);
+ if (level == 0)
+ ret -= bbl->leaf_slack;
+ else
+ ret -= bbl->node_slack;
+ return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb. The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+ /* Root blocks are not subject to minrecs rules. */
+ if (level == cur->bc_nlevels - 1)
+ return max(1U, npb);
+
+ return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level. For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level,
+ uint64_t nr_this_level,
+ unsigned int *avg_per_block,
+ uint64_t *blocks,
+ uint64_t *blocks_with_extra)
+{
+ uint64_t npb;
+ uint64_t dontcare;
+ unsigned int desired_npb;
+ unsigned int maxnr;
+
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+ /*
+ * Compute the number of blocks we need to fill each block with the
+ * desired number of records/keyptrs per block. Because desired_npb
+ * could be minrecs, we use regular integer division (which rounds
+ * the block count down) so that in the next step the effective # of
+ * items per block will never be less than desired_npb.
+ */
+ desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+ *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+ *blocks = max(1ULL, *blocks);
+
+ /*
+ * Compute the number of records that we will actually put in each
+ * block, assuming that we want to spread the records evenly between
+ * the blocks. Take care that the effective # of items per block (npb)
+ * won't exceed maxrecs even for the blocks that get an extra record,
+ * since desired_npb could be maxrecs, and in the previous step we
+ * rounded the block count down.
+ */
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+ (*blocks)++;
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ }
+
+ *avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+ trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+ *avg_per_block, desired_npb, *blocks,
+ *blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs. Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (bc_nlevels == level - 1). We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root. Increment bc_nevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ }
+
+ nr_blocks += level_blocks;
+ nr_this_level = level_blocks;
+ }
+
+ if (cur->bc_nlevels > cur->bc_maxlevels)
+ return -EOVERFLOW;
+
+ bbl->btree_height = cur->bc_nlevels;
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ bbl->nr_blocks = nr_blocks - 1;
+ else
+ bbl->nr_blocks = nr_blocks;
+ return 0;
+}
+
+/* Bulk load a btree given the parameters and geometry established in bbl. */
+int
+xfs_btree_bload(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ void *priv)
+{
+ struct list_head buffers_list;
+ union xfs_btree_ptr child_ptr;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block = NULL;
+ uint64_t nr_this_level = bbl->nr_records;
+ uint64_t blocks;
+ uint64_t i;
+ uint64_t blocks_with_extra;
+ uint64_t total_blocks = 0;
+ unsigned int avg_per_block;
+ unsigned int level = 0;
+ int ret;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ INIT_LIST_HEAD(&buffers_list);
+ cur->bc_nlevels = bbl->btree_height;
+ xfs_btree_set_ptr_null(cur, &child_ptr);
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each leaf block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ /*
+ * Due to rounding, btree blocks will not be evenly populated
+ * in most cases. blocks_with_extra tells us how many blocks
+ * will receive an extra record to distribute the excess across
+ * the current level as evenly as possible.
+ */
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
+ nr_this_block, &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+ nr_this_block);
+
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ block, priv);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost leaf pointer so we know where to start
+ * with the first node level.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ /* Populate the internal btree nodes. */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ union xfs_btree_ptr first_ptr;
+
+ nr_this_level = blocks;
+ block = NULL;
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each node block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl,
+ &buffers_list, level, nr_this_block,
+ &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks,
+ &ptr, nr_this_block);
+
+ ret = xfs_btree_bload_node(cur, nr_this_block,
+ &child_ptr, block);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost node pointer so that we know
+ * where to start the next node level above this one.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+ xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+ }
+
+ /* Initialize the new root. */
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
+ cur->bc_ino.ifake->if_blocks = total_blocks - 1;
+ } else {
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
+ cur->bc_ag.afake->af_levels = cur->bc_nlevels;
+ cur->bc_ag.afake->af_blocks = total_blocks;
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ ret = xfs_buf_delwri_submit(&buffers_list);
+ if (ret)
+ goto out;
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ ret = -EIO;
+ }
+
+out:
+ xfs_buf_delwri_cancel(&buffers_list);
+ if (bp)
+ xfs_buf_relse(bp);
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
new file mode 100644
index 000000000..f0d297605
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BTREE_STAGING_H__
+#define __XFS_BTREE_STAGING_H__
+
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+ /* AG block number of the new btree root. */
+ xfs_agblock_t af_root;
+
+ /* Height of the new btree. */
+ unsigned int af_levels;
+
+ /* Number of blocks used by the btree. */
+ unsigned int af_blocks;
+};
+
+/* Cursor interactions with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+
+/* Fake root for an inode-rooted btree. */
+struct xbtree_ifakeroot {
+ /* Fake inode fork. */
+ struct xfs_ifork *if_fork;
+
+ /* Number of blocks used by the btree. */
+ int64_t if_blocks;
+
+ /* Height of the new btree. */
+ unsigned int if_levels;
+
+ /* Number of bytes available for this fork in the inode. */
+ unsigned int if_fork_size;
+
+ /* Fork format. */
+ unsigned int if_format;
+
+ /* Number of records. */
+ unsigned int if_extents;
+};
+
+/* Cursor interactions with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ int whichfork, const struct xfs_btree_ops *ops);
+
+/* Bulk loading of staged btrees. */
+typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+ unsigned int nr_this_level, void *priv);
+
+struct xfs_btree_bload {
+ /*
+ * This function will be called nr_records times to load records into
+ * the btree. The function does this by setting the cursor's bc_rec
+ * field in in-core format. Records must be returned in sort order.
+ */
+ xfs_btree_bload_get_record_fn get_record;
+
+ /*
+ * This function will be called nr_blocks times to obtain a pointer
+ * to a new btree block on disk. Callers must preallocate all space
+ * for the new btree before calling xfs_btree_bload, and this function
+ * is what claims that reservation.
+ */
+ xfs_btree_bload_claim_block_fn claim_block;
+
+ /*
+ * This function should return the size of the in-core btree root
+ * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
+ * types.
+ */
+ xfs_btree_bload_iroot_size_fn iroot_size;
+
+ /*
+ * The caller should set this to the number of records that will be
+ * stored in the new btree.
+ */
+ uint64_t nr_records;
+
+ /*
+ * Number of free records to leave in each leaf block. If the caller
+ * sets this to -1, the slack value will be calculated to be halfway
+ * between maxrecs and minrecs. This typically leaves the block 75%
+ * full. Note that slack values are not enforced on inode root blocks.
+ */
+ int leaf_slack;
+
+ /*
+ * Number of free key/ptrs pairs to leave in each node block. This
+ * field has the same semantics as leaf_slack.
+ */
+ int node_slack;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * number of btree blocks needed to store nr_records records.
+ */
+ uint64_t nr_blocks;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * height of the new btree.
+ */
+ unsigned int btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+ void *priv);
+
+#endif /* __XFS_BTREE_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644
index 000000000..999a290cf
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED (~(uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it. The offset of the 32bit crc fields is passed as the
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
+ */
+static inline uint32_t
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t zero = 0;
+ uint32_t crc;
+
+ /* Calculate CRC up to the checksum. */
+ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+ /* Skip checksum field */
+ crc = crc32c(crc, &zero, sizeof(__u32));
+
+ /* Calculate the rest of the CRC. */
+ return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+ length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ /* zero the CRC field */
+ *(__le32 *)(buffer + cksum_offset) = 0;
+
+ /* single pass CRC calculation for the entire buffer */
+ return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(uint32_t crc)
+{
+ return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
+
+ *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
+
+ return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000..e576560b4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2698 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_errortag.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_root,
+ xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_blk,
+ xfs_da_state_blk_t *split_blk,
+ xfs_da_state_blk_t *blk_to_add,
+ int treelevel,
+ int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *node_blk_1,
+ xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+ xfs_da_state_blk_t *old_node_blk,
+ xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+ xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *src_node_blk,
+ xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk);
+
+
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+struct xfs_da_state *
+xfs_da_state_alloc(
+ struct xfs_da_args *args)
+{
+ struct xfs_da_state *state;
+
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ return state;
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_cache_free(xfs_da_state_cache, state);
+}
+
+void
+xfs_da_state_reset(
+ struct xfs_da_state *state,
+ struct xfs_da_args *args)
+{
+ xfs_da_state_kill_altpath(state);
+ memset(state, 0, sizeof(struct xfs_da_state));
+ state->args = args;
+ state->mp = state->args->dp->i_mount;
+}
+
+static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
+{
+ if (whichfork == XFS_DATA_FORK)
+ return mp->m_dir_geo->fsbcount;
+ return mp->m_attr_geo->fsbcount;
+}
+
+void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.__count);
+ to->level = be16_to_cpu(from3->hdr.__level);
+ to->btree = from3->__btree;
+ ASSERT(to->magic == XFS_DA3_NODE_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+ to->btree = from->__btree;
+ ASSERT(to->magic == XFS_DA_NODE_MAGIC);
+ }
+}
+
+void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.__count = cpu_to_be16(from->count);
+ to3->hdr.__level = cpu_to_be16(from->level);
+ } else {
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+ }
+}
+
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_da3_node_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_intnode *hdr = bp->b_addr;
+ struct xfs_da3_icnode_hdr ichdr;
+ xfs_failaddr_t fa;
+
+ xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr);
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ if (ichdr.level == 0)
+ return __this_address;
+ if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+ return __this_address;
+ if (ichdr.count == 0)
+ return __this_address;
+
+ /*
+ * we don't know if the node is for and attribute or directory tree,
+ * so only fail if the count is outside both bounds
+ */
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
+ return __this_address;
+
+ /* XXX: hash order check? */
+
+ return NULL;
+}
+
+static void
+xfs_da3_node_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_da3_node_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+ xfs_verifier_error(bp, -EFSBADCRC,
+ __this_address);
+ break;
+ }
+ fallthrough;
+ case XFS_DA_NODE_MAGIC:
+ fa = xfs_da3_node_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ break;
+ }
+}
+
+/* Verify the structure of a da3 block. */
+static xfs_failaddr_t
+xfs_da3_node_verify_struct(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ case XFS_DA_NODE_MAGIC:
+ return xfs_da3_node_verify(bp);
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ default:
+ return __this_address;
+ }
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
+ .verify_read = xfs_da3_node_read_verify,
+ .verify_write = xfs_da3_node_write_verify,
+ .verify_struct = xfs_da3_node_verify_struct,
+};
+
+static int
+xfs_da3_node_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ return 0;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ return 0;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ return 0;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
+ info, sizeof(*info));
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+ }
+}
+
+int
+xfs_da3_node_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ int error;
+
+ error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork,
+ &xfs_da3_node_buf_ops);
+ if (error || !*bpp || !tp)
+ return error;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+int
+xfs_da3_node_read_mapped(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
+ XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
+ bpp, &xfs_da3_node_buf_ops);
+ if (error || !*bpp)
+ return error;
+
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF);
+
+ if (!tp)
+ return 0;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da3_node_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ int level,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_da3_icnode_hdr ichdr = {0};
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_inode *dp = args->dp;
+
+ trace_xfs_da_node_create(args);
+ ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ node = bp->b_addr;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
+ ichdr.magic = XFS_DA3_NODE_MAGIC;
+ hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
+ } else {
+ ichdr.magic = XFS_DA_NODE_MAGIC;
+ }
+ ichdr.level = level;
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr);
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size));
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int /* error */
+xfs_da3_split(
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *oldblk;
+ struct xfs_da_state_blk *newblk;
+ struct xfs_da_state_blk *addblk;
+ struct xfs_da_intnode *node;
+ int max;
+ int action = 0;
+ int error;
+ int i;
+
+ trace_xfs_da_split(state->args);
+
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
+ /*
+ * Walk back up the tree splitting/inserting/adjusting as necessary.
+ * If we need to insert and there isn't room, split the node, then
+ * decide which fragment to insert the new block from below into.
+ * Note that we may split the root this way, but we need more fixup.
+ */
+ max = state->path.active - 1;
+ ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+ ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ addblk = &state->path.blk[max]; /* initial dummy value */
+ for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+ oldblk = &state->path.blk[i];
+ newblk = &state->altpath.blk[i];
+
+ /*
+ * If a leaf node then
+ * Allocate a new leaf node, then rebalance across them.
+ * else if an intermediate node then
+ * We split on the last layer, must we split the node?
+ */
+ switch (oldblk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr3_leaf_split(state, oldblk, newblk);
+ if ((error != 0) && (error != -ENOSPC)) {
+ return error; /* GROT: attr is inconsistent */
+ }
+ if (!error) {
+ addblk = newblk;
+ break;
+ }
+ /*
+ * Entry wouldn't fit, split the leaf again. The new
+ * extrablk will be consumed by xfs_da3_node_split if
+ * the node is split.
+ */
+ state->extravalid = 1;
+ if (state->inleaf) {
+ state->extraafter = 0; /* before newblk */
+ trace_xfs_attr_leaf_split_before(state->args);
+ error = xfs_attr3_leaf_split(state, oldblk,
+ &state->extrablk);
+ } else {
+ state->extraafter = 1; /* after newblk */
+ trace_xfs_attr_leaf_split_after(state->args);
+ error = xfs_attr3_leaf_split(state, newblk,
+ &state->extrablk);
+ }
+ if (error)
+ return error; /* GROT: attr inconsistent */
+ addblk = newblk;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_split(state, oldblk, newblk);
+ if (error)
+ return error;
+ addblk = newblk;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+ max - i, &action);
+ addblk->bp = NULL;
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+ /*
+ * Record the newly split block for the next time thru?
+ */
+ if (action)
+ addblk = newblk;
+ else
+ addblk = NULL;
+ break;
+ }
+
+ /*
+ * Update the btree to show the new hashval for this child.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ }
+ if (!addblk)
+ return 0;
+
+ /*
+ * xfs_da3_node_split() should have consumed any extra blocks we added
+ * during a double leaf split in the attr fork. This is guaranteed as
+ * we can't be here if the attr fork only has a single leaf block.
+ */
+ ASSERT(state->extravalid == 0 ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Split the root node.
+ */
+ ASSERT(state->path.active == 0);
+ oldblk = &state->path.blk[0];
+ error = xfs_da3_root_split(state, oldblk, addblk);
+ if (error)
+ goto out;
+
+ /*
+ * Update pointers to the node which used to be block 0 and just got
+ * bumped because of the addition of a new root node. Note that the
+ * original block 0 could be at any position in the list of blocks in
+ * the tree.
+ *
+ * Note: the magic numbers and sibling pointers are in the same physical
+ * place for both v2 and v3 headers (by design). Hence it doesn't matter
+ * which version of the xfs_da_intnode structure we use here as the
+ * result will be the same using either structure.
+ */
+ node = oldblk->bp->b_addr;
+ if (node->hdr.info.forw) {
+ if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ node = addblk->bp->b_addr;
+ node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+ node = oldblk->bp->b_addr;
+ if (node->hdr.info.back) {
+ if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ node = addblk->bp->b_addr;
+ node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+out:
+ addblk->bp = NULL;
+ return error;
+}
+
+/*
+ * Split the root. We have to create a new root and point to the two
+ * parts (the split old root) that we just created. Copy block zero to
+ * the EOF, extending the inode in process.
+ */
+STATIC int /* error */
+xfs_da3_root_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_dir2_leaf *leaf;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+ int size;
+
+ trace_xfs_da_root_split(state->args);
+
+ /*
+ * Copy the existing (incorrect) block from the root node position
+ * to a free space somewhere.
+ */
+ args = state->args;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ tp = args->trans;
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork);
+ if (error)
+ return error;
+ node = bp->b_addr;
+ oldroot = blk1->bp->b_addr;
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_icnode_hdr icnodehdr;
+
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot);
+ btree = icnodehdr.btree;
+ size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+ level = icnodehdr.level;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ } else {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ leaf = (xfs_dir2_leaf_t *)oldroot;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ size = (int)((char *)&leafhdr.ents[leafhdr.count] -
+ (char *)leaf);
+ level = 0;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+
+ /*
+ * we can copy most of the information in the node from one block to
+ * another, but for CRC enabled headers we have to make sure that the
+ * block specific identifiers are kept intact. We update the buffer
+ * directly for this.
+ */
+ memcpy(node, oldroot, size);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+ node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ }
+ xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+ bp->b_ops = blk1->bp->b_ops;
+ xfs_trans_buf_copy_type(bp, blk1->bp);
+ blk1->bp = bp;
+ blk1->blkno = blkno;
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da3_node_create(args,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+ level + 1, &bp, args->whichfork);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+ btree[0].hashval = cpu_to_be32(blk1->hashval);
+ btree[0].before = cpu_to_be32(blk1->blkno);
+ btree[1].hashval = cpu_to_be32(blk2->hashval);
+ btree[1].before = cpu_to_be32(blk2->blkno);
+ nodehdr.count = 2;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+
+#ifdef DEBUG
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
+ }
+#endif
+
+ /* Header is already logged by xfs_da_node_create */
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+
+ return 0;
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int /* error */
+xfs_da3_node_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk,
+ struct xfs_da_state_blk *addblk,
+ int treelevel,
+ int *result)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno;
+ int newcount;
+ int error;
+ int useextra;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_split(state->args);
+
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+
+ /*
+ * With V2 dirs the extra block is data or freespace.
+ */
+ useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+ newcount = 1 + useextra;
+ /*
+ * Do we have to split the node?
+ */
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
+ /*
+ * Allocate a new node, add to the doubly linked chain of
+ * nodes, then move some of our excess entries into it.
+ */
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+
+ error = xfs_da3_node_create(state->args, blkno, treelevel,
+ &newblk->bp, state->args->whichfork);
+ if (error)
+ return error; /* GROT: dir is inconsistent */
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DA_NODE_MAGIC;
+ xfs_da3_node_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error)
+ return error;
+ *result = 1;
+ } else {
+ *result = 0;
+ }
+
+ /*
+ * Insert the new entry(s) into the correct block
+ * (updating last hashval in the process).
+ *
+ * xfs_da3_node_add() inserts BEFORE the given index,
+ * and as a result of using node_lookup_int() we always
+ * point to a valid entry (not after one), but a split
+ * operation always results in a new block whose hashvals
+ * FOLLOW the current block.
+ *
+ * If we had double-split op below us, then add the extra block too.
+ */
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ if (oldblk->index <= nodehdr.count) {
+ oldblk->index++;
+ xfs_da3_node_add(state, oldblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ oldblk->index++;
+ xfs_da3_node_add(state, oldblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ } else {
+ newblk->index++;
+ xfs_da3_node_add(state, newblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ newblk->index++;
+ xfs_da3_node_add(state, newblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da3_node_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da_node_entry *btree_s;
+ struct xfs_da_node_entry *btree_d;
+ struct xfs_da3_icnode_hdr nodehdr1;
+ struct xfs_da3_icnode_hdr nodehdr2;
+ struct xfs_trans *tp;
+ int count;
+ int tmp;
+ int swap = 0;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_rebalance(state->args);
+
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+
+ /*
+ * Figure out how many entries need to move, and in which direction.
+ * Swap the nodes around if that makes it simpler.
+ */
+ if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+ be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+ swap(node1, node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+ swap = 1;
+ }
+
+ count = (nodehdr1.count - nodehdr2.count) / 2;
+ if (count == 0)
+ return;
+ tp = state->args->trans;
+ /*
+ * Two cases: high-to-low and low-to-high.
+ */
+ if (count > 0) {
+ /*
+ * Move elements in node2 up to make a hole.
+ */
+ tmp = nodehdr2.count;
+ if (tmp > 0) {
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[0];
+ btree_d = &btree2[count];
+ memmove(btree_d, btree_s, tmp);
+ }
+
+ /*
+ * Move the req'd B-tree elements from high in node1 to
+ * low in node2.
+ */
+ nodehdr2.count += count;
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree1[nodehdr1.count - count];
+ btree_d = &btree2[0];
+ memcpy(btree_d, btree_s, tmp);
+ nodehdr1.count -= count;
+ } else {
+ /*
+ * Move the req'd B-tree elements from low in node2 to
+ * high in node1.
+ */
+ count = -count;
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[0];
+ btree_d = &btree1[nodehdr1.count];
+ memcpy(btree_d, btree_s, tmp);
+ nodehdr1.count += count;
+
+ xfs_trans_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+ /*
+ * Move elements in node2 down to fill the hole.
+ */
+ tmp = nodehdr2.count - count;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &btree2[count];
+ btree_d = &btree2[0];
+ memmove(btree_d, btree_s, tmp);
+ nodehdr2.count -= count;
+ }
+
+ /*
+ * Log header of node 1 and all current bits of node 2.
+ */
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1);
+ xfs_trans_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, &node1->hdr,
+ state->args->geo->node_hdr_size));
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2);
+ xfs_trans_log_buf(tp, blk2->bp,
+ XFS_DA_LOGRANGE(node2, &node2->hdr,
+ state->args->geo->node_hdr_size +
+ (sizeof(btree2[0]) * nodehdr2.count)));
+
+ /*
+ * Record the last hashval from each block for upward propagation.
+ * (note: don't use the swapped node pointers)
+ */
+ if (swap) {
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
+ }
+ blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (blk1->index >= nodehdr1.count) {
+ blk2->index = blk1->index - nodehdr1.count;
+ blk1->index = nodehdr1.count + 1; /* make it invalid */
+ }
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da3_node_add(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_add(state->args);
+
+ node = oldblk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+
+ ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+ ASSERT(newblk->blkno != 0);
+ if (state->args->whichfork == XFS_DATA_FORK)
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
+
+ /*
+ * We may need to make some room before we insert the new node.
+ */
+ tmp = 0;
+ if (oldblk->index < nodehdr.count) {
+ tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+ memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
+ }
+ btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+ btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+ tmp + sizeof(*btree)));
+
+ nodehdr.count += 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr,
+ state->args->geo->node_hdr_size));
+
+ /*
+ * Copy the last hash value from the oldblk to propagate upwards.
+ */
+ oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da3_join(
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *drop_blk;
+ struct xfs_da_state_blk *save_blk;
+ int action = 0;
+ int error;
+
+ trace_xfs_da_join(state->args);
+
+ drop_blk = &state->path.blk[ state->path.active-1 ];
+ save_blk = &state->altpath.blk[ state->path.active-1 ];
+ ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+ ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Walk back up the tree joining/deallocating as necessary.
+ * When we stop dropping blocks, break out.
+ */
+ for ( ; state->path.active >= 2; drop_blk--, save_blk--,
+ state->path.active--) {
+ /*
+ * See if we can combine the block with a neighbor.
+ * (action == 0) => no options, just leave
+ * (action == 1) => coalesce, then unlink
+ * (action == 2) => block empty, unlink it
+ */
+ switch (drop_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr3_leaf_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ /*
+ * Remove the offending node, fixup hashvals,
+ * check for a toosmall neighbor.
+ */
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_node_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_da3_node_unbalance(state, drop_blk, save_blk);
+ break;
+ }
+ xfs_da3_fixhashpath(state, &state->altpath);
+ error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+ xfs_da_state_kill_altpath(state);
+ if (error)
+ return error;
+ error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+ drop_blk->bp);
+ drop_blk->bp = NULL;
+ if (error)
+ return error;
+ }
+ /*
+ * We joined all the way to the top. If it turns out that
+ * we only have one entry in the root, make the child block
+ * the new root.
+ */
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_root_join(state, &state->path.blk[0]);
+ return error;
+}
+
+#ifdef DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+ __be16 magic = blkinfo->magic;
+
+ if (level == 1) {
+ ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+ } else {
+ ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ }
+ ASSERT(!blkinfo->forw);
+ ASSERT(!blkinfo->back);
+}
+#else /* !DEBUG */
+#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif /* !DEBUG */
+
+/*
+ * We have only one entry in the root. Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da3_root_join(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *root_blk)
+{
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_args *args;
+ xfs_dablk_t child;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr oldroothdr;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_root_join(state->args);
+
+ ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+ args = state->args;
+ oldroot = root_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot);
+ ASSERT(oldroothdr.forw == 0);
+ ASSERT(oldroothdr.back == 0);
+
+ /*
+ * If the root has more than one child, then don't do anything.
+ */
+ if (oldroothdr.count > 1)
+ return 0;
+
+ /*
+ * Read in the (only) child block, then copy those bytes into
+ * the root block's buffer and free the original child block.
+ */
+ child = be32_to_cpu(oldroothdr.btree[0].before);
+ ASSERT(child != 0);
+ error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
+ if (error)
+ return error;
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+ /*
+ * This could be copying a leaf back into the root block in the case of
+ * there only being a single leaf block left in the tree. Hence we have
+ * to update the b_ops pointer as well to match the buffer type change
+ * that could occur. For dir3 blocks we also need to update the block
+ * number in the buffer header.
+ */
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+ root_blk->bp->b_ops = bp->b_ops;
+ xfs_trans_buf_copy_type(root_blk->bp, bp);
+ if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
+ }
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
+ error = xfs_da_shrink_inode(args, child, bp);
+ return error;
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+ struct xfs_da_state *state,
+ int *action)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int count;
+ int forward;
+ int error;
+ int retval;
+ int i;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_toosmall(state->args);
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ info = blk->bp->b_addr;
+ node = (xfs_da_intnode_t *)info;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+ *action = 0; /* blk over 50%, don't try to join */
+ return 0; /* blk over 50%, don't try to join */
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (nodehdr.count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (info->forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return 0;
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
+ count -= nodehdr.count;
+
+ /* start with smaller blk num */
+ forward = nodehdr.forw < nodehdr.back;
+ for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_da3_icnode_hdr thdr;
+ if (forward)
+ blkno = nodehdr.forw;
+ else
+ blkno = nodehdr.back;
+ if (blkno == 0)
+ continue;
+ error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp,
+ state->args->whichfork);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
+ xfs_trans_brelse(state->args->trans, bp);
+
+ if (count - thdr.count >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ } else {
+ error = xfs_da3_path_shift(state, &state->path, forward,
+ 0, &retval);
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ return 0;
+ }
+ *action = 1;
+ return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr);
+ if (count)
+ *count = nodehdr.count;
+ if (!nodehdr.count)
+ return 0;
+ return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da3_fixhashpath(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ xfs_dahash_t lasthash=0;
+ int level;
+ int count;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_fixhashpath(state->args);
+
+ level = path->active-1;
+ blk = &path->blk[ level ];
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ }
+ for (blk--, level--; level >= 0; blk--, level--) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+ if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
+ break;
+ blk->hashval = lasthash;
+ btree[blk->index].hashval = cpu_to_be32(lasthash);
+ xfs_trans_log_buf(state->args->trans, blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[blk->index],
+ sizeof(*btree)));
+
+ lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ }
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da3_node_remove(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int index;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_remove(state->args);
+
+ node = drop_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ ASSERT(drop_blk->index < nodehdr.count);
+ ASSERT(drop_blk->index >= 0);
+
+ /*
+ * Copy over the offending entry, or just zero it out.
+ */
+ index = drop_blk->index;
+ btree = nodehdr.btree;
+ if (index < nodehdr.count - 1) {
+ tmp = nodehdr.count - index - 1;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ memmove(&btree[index], &btree[index + 1], tmp);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], tmp));
+ index = nodehdr.count - 1;
+ }
+ memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+ nodehdr.count -= 1;
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size));
+
+ /*
+ * Copy the last hash value from the block to propagate upwards.
+ */
+ drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+}
+
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_da_intnode *drop_node;
+ struct xfs_da_intnode *save_node;
+ struct xfs_da_node_entry *drop_btree;
+ struct xfs_da_node_entry *save_btree;
+ struct xfs_da3_icnode_hdr drop_hdr;
+ struct xfs_da3_icnode_hdr save_hdr;
+ struct xfs_trans *tp;
+ int sindex;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_unbalance(state->args);
+
+ drop_node = drop_blk->bp->b_addr;
+ save_node = save_blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node);
+ drop_btree = drop_hdr.btree;
+ save_btree = save_hdr.btree;
+ tp = state->args->trans;
+
+ /*
+ * If the dying block has lower hashvals, then move all the
+ * elements in the remaining block up to make a hole.
+ */
+ if ((be32_to_cpu(drop_btree[0].hashval) <
+ be32_to_cpu(save_btree[0].hashval)) ||
+ (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+ be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+ /* XXX: check this - is memmove dst correct? */
+ tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+ memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+ sindex = 0;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[0],
+ (save_hdr.count + drop_hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
+ } else {
+ sindex = save_hdr.count;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+ drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+ }
+
+ /*
+ * Move all the B-tree elements from drop_blk to save_blk.
+ */
+ tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+ save_hdr.count += drop_hdr.count;
+
+ xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr);
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+ state->args->geo->node_hdr_size));
+
+ /*
+ * Save the last hashval in the remaining block for upward propagation.
+ */
+ save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend. This is a
+ * pruned depth-first tree search.
+ */
+int /* error */
+xfs_da3_node_lookup_int(
+ struct xfs_da_state *state,
+ int *result)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *curr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ xfs_dablk_t blkno;
+ xfs_dahash_t hashval;
+ xfs_dahash_t btreehashval;
+ int probe;
+ int span;
+ int max;
+ int error;
+ int retval;
+ unsigned int expected_level = 0;
+ uint16_t magic;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+
+ /*
+ * Descend thru the B-tree searching each level for the right
+ * node to use, until the right hashval is found.
+ */
+ blkno = args->geo->leafblk;
+ for (blk = &state->path.blk[0], state->path.active = 1;
+ state->path.active <= XFS_DA_NODE_MAXDEPTH;
+ blk++, state->path.active++) {
+ /*
+ * Read the next node down in the tree.
+ */
+ blk->blkno = blkno;
+ error = xfs_da3_node_read(args->trans, args->dp, blkno,
+ &blk->bp, args->whichfork);
+ if (error) {
+ blk->blkno = 0;
+ state->path.active--;
+ return error;
+ }
+ curr = blk->bp->b_addr;
+ magic = be16_to_cpu(curr->magic);
+
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC) {
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ if (magic == XFS_DIR2_LEAFN_MAGIC ||
+ magic == XFS_DIR3_LEAFN_MAGIC) {
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ }
+
+ if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ }
+
+ blk->magic = XFS_DA_NODE_MAGIC;
+
+ /*
+ * Search an intermediate node for a match.
+ */
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
+
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ }
+
+ /* Check the level from the root. */
+ if (blkno == args->geo->leafblk)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level) {
+ xfs_buf_mark_corrupt(blk->bp);
+ return -EFSCORRUPTED;
+ } else
+ expected_level--;
+
+ max = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ while (span > 4) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree[probe].hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) ||
+ (be32_to_cpu(btree[probe].hashval) == hashval));
+
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while (probe > 0 &&
+ be32_to_cpu(btree[probe].hashval) >= hashval) {
+ probe--;
+ }
+ while (probe < max &&
+ be32_to_cpu(btree[probe].hashval) < hashval) {
+ probe++;
+ }
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max - 1;
+ blkno = be32_to_cpu(btree[max - 1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree[probe].before);
+ }
+
+ /* We can't point back to the root. */
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+ return -EFSCORRUPTED;
+
+ /*
+ * A leaf block that ends in the hashval that we are interested in
+ * (final hashval == search hashval) means that the next block may
+ * contain more entries with the same hashval, shift upward to the
+ * next leaf and keep searching.
+ */
+ for (;;) {
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+ retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+ &blk->index, state);
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+ blk->index = args->index;
+ args->blkno = blk->blkno;
+ } else {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
+ (blk->hashval == args->hashval)) {
+ error = xfs_da3_path_shift(state, &state->path, 1, 1,
+ &retval);
+ if (error)
+ return error;
+ if (retval == 0) {
+ continue;
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ /* path_shift() gives ENOENT */
+ retval = -ENOATTR;
+ }
+ }
+ break;
+ }
+ *result = retval;
+ return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da3_icnode_hdr node1hdr;
+ struct xfs_da3_icnode_hdr node2hdr;
+
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2);
+ btree1 = node1hdr.btree;
+ btree2 = node2hdr.btree;
+
+ if (node1hdr.count > 0 && node2hdr.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+ be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int /* error */
+xfs_da3_blk_link(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *old_blk,
+ struct xfs_da_state_blk *new_blk)
+{
+ struct xfs_da_blkinfo *old_info;
+ struct xfs_da_blkinfo *new_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int before = 0;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ old_info = old_blk->bp->b_addr;
+ new_info = new_blk->bp->b_addr;
+ ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+ old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ switch (old_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+ break;
+ }
+
+ /*
+ * Link blocks in appropriate order.
+ */
+ if (before) {
+ /*
+ * Link new block in before existing block.
+ */
+ trace_xfs_da_link_before(args);
+ new_info->forw = cpu_to_be32(old_blk->blkno);
+ new_info->back = old_info->back;
+ if (old_info->back) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->back),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+ tmp_info->forw = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ }
+ old_info->back = cpu_to_be32(new_blk->blkno);
+ } else {
+ /*
+ * Link new block in after existing block.
+ */
+ trace_xfs_da_link_after(args);
+ new_info->forw = old_info->forw;
+ new_info->back = cpu_to_be32(old_blk->blkno);
+ if (old_info->forw) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->forw),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+ tmp_info->back = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ }
+ old_info->forw = cpu_to_be32(new_blk->blkno);
+ }
+
+ xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+ return 0;
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int /* error */
+xfs_da3_blk_unlink(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
+{
+ struct xfs_da_blkinfo *drop_info;
+ struct xfs_da_blkinfo *save_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ save_info = save_blk->bp->b_addr;
+ drop_info = drop_blk->bp->b_addr;
+ ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+ save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(save_blk->magic == drop_blk->magic);
+ ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+ (be32_to_cpu(save_info->back) == drop_blk->blkno));
+ ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+ (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+ /*
+ * Unlink the leaf block from the doubly linked chain of leaves.
+ */
+ if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ trace_xfs_da_unlink_back(args);
+ save_info->back = drop_info->back;
+ if (drop_info->back) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->back),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+ tmp_info->forw = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ }
+ } else {
+ trace_xfs_da_unlink_forward(args);
+ save_info->forw = drop_info->forw;
+ if (drop_info->forw) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->forw),
+ &bp, args->whichfork);
+ if (error)
+ return error;
+ ASSERT(bp != NULL);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+ tmp_info->back = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ }
+ }
+
+ xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+ return 0;
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int /* error */
+xfs_da3_path_shift(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path,
+ int forward,
+ int release,
+ int *result)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ struct xfs_da_args *args;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno = 0;
+ int level;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_path_shift(state->args);
+
+ /*
+ * Roll up the Btree looking for the first block where our
+ * current index is not at the edge of the block. Note that
+ * we skip the bottom layer because we want the sibling block.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ ASSERT(path != NULL);
+ ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ level = (path->active-1) - 1; /* skip bottom layer in path */
+ for (; level >= 0; level--) {
+ blk = &path->blk[level];
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ blk->bp->b_addr);
+
+ if (forward && (blk->index < nodehdr.count - 1)) {
+ blk->index++;
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
+ break;
+ } else if (!forward && (blk->index > 0)) {
+ blk->index--;
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
+ break;
+ }
+ }
+ if (level < 0) {
+ *result = -ENOENT; /* we're out of our tree */
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ return 0;
+ }
+
+ /*
+ * Roll down the edge of the subtree until we reach the
+ * same depth we were at originally.
+ */
+ for (blk++, level++; level < path->active; blk++, level++) {
+ /*
+ * Read the next child block into a local buffer.
+ */
+ error = xfs_da3_node_read(args->trans, dp, blkno, &bp,
+ args->whichfork);
+ if (error)
+ return error;
+
+ /*
+ * Release the old block (if it's dirty, the trans doesn't
+ * actually let go) and swap the local buffer into the path
+ * structure. This ensures failure of the above read doesn't set
+ * a NULL buffer in an active slot in the path.
+ */
+ if (release)
+ xfs_trans_brelse(args->trans, blk->bp);
+ blk->blkno = blkno;
+ blk->bp = bp;
+
+ info = blk->bp->b_addr;
+ ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+ /*
+ * Note: we flatten the magic number to a single type so we
+ * don't have to compare against crc/non-crc types elsewhere.
+ */
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ blk->magic = XFS_DA_NODE_MAGIC;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ bp->b_addr);
+ btree = nodehdr.btree;
+ blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ if (forward)
+ blk->index = 0;
+ else
+ blk->index = nodehdr.count - 1;
+ blkno = be32_to_cpu(btree[blk->index].before);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+ *result = 0;
+ return 0;
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const uint8_t *name, int namelen)
+{
+ xfs_dahash_t hash;
+
+ /*
+ * Do four characters at a time as long as we can.
+ */
+ for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+ hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+ (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+ /*
+ * Now do the rest of the characters.
+ */
+ switch (namelen) {
+ case 3:
+ return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+ rol32(hash, 7 * 3);
+ case 2:
+ return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+ case 1:
+ return (name[0] << 0) ^ rol32(hash, 7 * 1);
+ default: /* case 0: */
+ return hash;
+ }
+}
+
+enum xfs_dacmp
+xfs_da_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+ XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+int
+xfs_da_grow_inode_int(
+ struct xfs_da_args *args,
+ xfs_fileoff_t *bno,
+ int count)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ int w = args->whichfork;
+ xfs_rfsblock_t nblks = dp->i_nblocks;
+ struct xfs_bmbt_irec map, *mapp;
+ int nmap, error, got, i, mapi;
+
+ /*
+ * Find a spot in the file space to put the new block.
+ */
+ error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+ if (error)
+ return error;
+
+ /*
+ * Try mapping it in one filesystem block.
+ */
+ nmap = 1;
+ error = xfs_bmapi_write(tp, dp, *bno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+ args->total, &map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap <= 1);
+ if (nmap == 1) {
+ mapp = &map;
+ mapi = 1;
+ } else if (nmap == 0 && count > 1) {
+ xfs_fileoff_t b;
+ int c;
+
+ /*
+ * If we didn't get it and the block might work if fragmented,
+ * try without the CONTIG flag. Loop until we get it all.
+ */
+ mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+ for (b = *bno, mapi = 0; b < *bno + count; ) {
+ c = (int)(*bno + count - b);
+ nmap = min(XFS_BMAP_MAX_NMAP, c);
+ error = xfs_bmapi_write(tp, dp, b, c,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ args->total, &mapp[mapi], &nmap);
+ if (error)
+ goto out_free_map;
+ if (nmap < 1)
+ break;
+ mapi += nmap;
+ b = mapp[mapi - 1].br_startoff +
+ mapp[mapi - 1].br_blockcount;
+ }
+ } else {
+ mapi = 0;
+ mapp = NULL;
+ }
+
+ /*
+ * Count the blocks we got, make sure it matches the total.
+ */
+ for (i = 0, got = 0; i < mapi; i++)
+ got += mapp[i].br_blockcount;
+ if (got != count || mapp[0].br_startoff != *bno ||
+ mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+ *bno + count) {
+ error = -ENOSPC;
+ goto out_free_map;
+ }
+
+ /* account for newly allocated blocks in reserved blocks total */
+ args->total -= dp->i_nblocks - nblks;
+
+out_free_map:
+ if (mapp != &map)
+ kmem_free(mapp);
+ return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+ struct xfs_da_args *args,
+ xfs_dablk_t *new_blkno)
+{
+ xfs_fileoff_t bno;
+ int error;
+
+ trace_xfs_da_grow_inode(args);
+
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+ if (!error)
+ *new_blkno = (xfs_dablk_t)bno;
+ return error;
+}
+
+/*
+ * Ick. We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file. The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+ struct xfs_da_args *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
+{
+ struct xfs_da_blkinfo *dead_info;
+ struct xfs_da_blkinfo *sib_info;
+ struct xfs_da_intnode *par_node;
+ struct xfs_da_intnode *dead_node;
+ struct xfs_dir2_leaf *dead_leaf2;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr par_hdr;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_buf *dead_buf;
+ struct xfs_buf *last_buf;
+ struct xfs_buf *sib_buf;
+ struct xfs_buf *par_buf;
+ xfs_dahash_t dead_hash;
+ xfs_fileoff_t lastoff;
+ xfs_dablk_t dead_blkno;
+ xfs_dablk_t last_blkno;
+ xfs_dablk_t sib_blkno;
+ xfs_dablk_t par_blkno;
+ int error;
+ int w;
+ int entno;
+ int level;
+ int dead_level;
+
+ trace_xfs_da_swap_lastblock(args);
+
+ dead_buf = *dead_bufp;
+ dead_blkno = *dead_blknop;
+ tp = args->trans;
+ dp = args->dp;
+ w = args->whichfork;
+ ASSERT(w == XFS_DATA_FORK);
+ mp = dp->i_mount;
+ lastoff = args->geo->freeblk;
+ error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, lastoff == 0))
+ return -EFSCORRUPTED;
+ /*
+ * Read the last block in the btree space.
+ */
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+ error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
+ if (error)
+ return error;
+ /*
+ * Copy the last block into the dead buffer and log it.
+ */
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+ dead_info = dead_buf->b_addr;
+ /*
+ * Get values from the moved block.
+ */
+ if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
+ dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr,
+ dead_leaf2);
+ ents = leafhdr.ents;
+ dead_level = 0;
+ dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+ } else {
+ struct xfs_da3_icnode_hdr deadhdr;
+
+ dead_node = (xfs_da_intnode_t *)dead_info;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node);
+ btree = deadhdr.btree;
+ dead_level = deadhdr.level;
+ dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+ }
+ sib_buf = par_buf = NULL;
+ /*
+ * If the moved block has a left sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
+ if (error)
+ goto done;
+ sib_info = sib_buf->b_addr;
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ sib_info->forw = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+ sizeof(sib_info->forw)));
+ sib_buf = NULL;
+ }
+ /*
+ * If the moved block has a right sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
+ if (error)
+ goto done;
+ sib_info = sib_buf->b_addr;
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ sib_info->back = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+ sizeof(sib_info->back)));
+ sib_buf = NULL;
+ }
+ par_blkno = args->geo->leafblk;
+ level = -1;
+ /*
+ * Walk down the tree looking for the parent of the moved block.
+ */
+ for (;;) {
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
+ if (error)
+ goto done;
+ par_node = par_buf->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp,
+ level >= 0 && level != par_hdr.level + 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ level = par_hdr.level;
+ btree = par_hdr.btree;
+ for (entno = 0;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].hashval) < dead_hash;
+ entno++)
+ continue;
+ if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ par_blkno = be32_to_cpu(btree[entno].before);
+ if (level == dead_level + 1)
+ break;
+ xfs_trans_brelse(tp, par_buf);
+ par_buf = NULL;
+ }
+ /*
+ * We're in the right parent block.
+ * Look for the right entry.
+ */
+ for (;;) {
+ for (;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].before) != last_blkno;
+ entno++)
+ continue;
+ if (entno < par_hdr.count)
+ break;
+ par_blkno = par_hdr.forw;
+ xfs_trans_brelse(tp, par_buf);
+ par_buf = NULL;
+ if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
+ if (error)
+ goto done;
+ par_node = par_buf->b_addr;
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ btree = par_hdr.btree;
+ entno = 0;
+ }
+ /*
+ * Update the parent entry pointing to the moved block.
+ */
+ btree[entno].before = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, par_buf,
+ XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+ sizeof(btree[entno].before)));
+ *dead_blknop = last_blkno;
+ *dead_bufp = last_buf;
+ return 0;
+done:
+ if (par_buf)
+ xfs_trans_brelse(tp, par_buf);
+ if (sib_buf)
+ xfs_trans_brelse(tp, sib_buf);
+ xfs_trans_brelse(tp, last_buf);
+ return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(
+ struct xfs_da_args *args,
+ xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf)
+{
+ struct xfs_inode *dp;
+ int done, error, w, count;
+ struct xfs_trans *tp;
+
+ trace_xfs_da_shrink_inode(args);
+
+ dp = args->dp;
+ w = args->whichfork;
+ tp = args->trans;
+ count = args->geo->fsbcount;
+ for (;;) {
+ /*
+ * Remove extents. If we get ENOSPC for a dir we have to move
+ * the last block to the place we want to kill.
+ */
+ error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w), 0, &done);
+ if (error == -ENOSPC) {
+ if (w != XFS_DATA_FORK)
+ break;
+ error = xfs_da3_swap_lastblock(args, &dead_blkno,
+ &dead_buf);
+ if (error)
+ break;
+ } else {
+ break;
+ }
+ }
+ xfs_trans_binval(tp, dead_buf);
+ return error;
+}
+
+static int
+xfs_dabuf_map(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ int whichfork,
+ struct xfs_buf_map **mapp,
+ int *nmaps)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int nfsb = xfs_dabuf_nfsb(mp, whichfork);
+ struct xfs_bmbt_irec irec, *irecs = &irec;
+ struct xfs_buf_map *map = *mapp;
+ xfs_fileoff_t off = bno;
+ int error = 0, nirecs, i;
+
+ if (nfsb > 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
+ xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto out_free_irecs;
+
+ /*
+ * Use the caller provided map for the single map case, else allocate a
+ * larger one that needs to be free by the caller.
+ */
+ if (nirecs > 1) {
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ if (!map) {
+ error = -ENOMEM;
+ goto out_free_irecs;
+ }
+ *mapp = map;
+ }
+
+ for (i = 0; i < nirecs; i++) {
+ if (irecs[i].br_startblock == HOLESTARTBLOCK ||
+ irecs[i].br_startblock == DELAYSTARTBLOCK)
+ goto invalid_mapping;
+ if (off != irecs[i].br_startoff)
+ goto invalid_mapping;
+
+ map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+ map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ off += irecs[i].br_blockcount;
+ }
+
+ if (off != bno + nfsb)
+ goto invalid_mapping;
+
+ *nmaps = nirecs;
+out_free_irecs:
+ if (irecs != &irec)
+ kmem_free(irecs);
+ return error;
+
+invalid_mapping:
+ /* Caller ok with no mapping. */
+ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ error = -EFSCORRUPTED;
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "%s: bno %u inode %llu",
+ __func__, bno, dp->i_ino);
+
+ for (i = 0; i < nirecs; i++) {
+ xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+ i, irecs[i].br_startoff,
+ irecs[i].br_startblock,
+ irecs[i].br_blockcount,
+ irecs[i].br_state);
+ }
+ }
+ } else {
+ *nmaps = 0;
+ }
+ goto out_free_irecs;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
+ int error;
+
+ *bpp = NULL;
+ error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap);
+ if (error || nmap == 0)
+ goto out_free;
+
+ error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp);
+ if (error)
+ goto out_free;
+
+ *bpp = bp;
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ struct xfs_buf **bpp,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
+ int error;
+
+ *bpp = NULL;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
+ goto out_free;
+
+ error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
+ &bp, ops);
+ if (error)
+ goto out_free;
+
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+ *bpp = bp;
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+int
+xfs_da_reada_buf(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
+ goto out_free;
+
+ xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644
index 000000000..ffa3df5b2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+ unsigned int blksize; /* da block size in bytes */
+ unsigned int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ unsigned int node_hdr_size; /* danode header size in bytes */
+ unsigned int node_ents; /* # of entries in a danode */
+ unsigned int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ unsigned int leaf_hdr_size; /* dir2 leaf header size */
+ unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ unsigned int free_hdr_size; /* dir2 free header size */
+ unsigned int free_max_bests; /* # of bests entries in dir2 free */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
+
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
+};
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+ XFS_CMP_DIFFERENT, /* names are completely different */
+ XFS_CMP_EXACT, /* names are exactly the same */
+ XFS_CMP_CASE /* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+ struct xfs_da_geometry *geo; /* da block geometry */
+ const uint8_t *name; /* string (maybe not NULL terminated) */
+ int namelen; /* length of string (maybe no NULL) */
+ uint8_t filetype; /* filetype of inode for directories */
+ void *value; /* set of bytes (maybe contain NULLs) */
+ int valuelen; /* length of value */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
+ xfs_dahash_t hashval; /* hash value of name */
+ xfs_ino_t inumber; /* input/output inode number */
+ struct xfs_inode *dp; /* directory inode to manipulate */
+ struct xfs_trans *trans; /* current trans (changes over time) */
+ xfs_extlen_t total; /* total blocks needed, for 1st bmap */
+ int whichfork; /* data or attribute fork */
+ xfs_dablk_t blkno; /* blkno of attr leaf of interest */
+ int index; /* index of attr of interest in blk */
+ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
+ int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
+ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
+ int index2; /* index of 2nd attr in blk */
+ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
+ int rmtblkcnt2; /* remote attr value block count */
+ int rmtvaluelen2; /* remote attr value length in bytes */
+ uint32_t op_flags; /* operation flags */
+ enum xfs_dacmp cmpresult; /* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
+
+#define XFS_DA_OP_FLAGS \
+ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
+ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
+ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes. With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+ struct xfs_buf *bp; /* buffer containing block */
+ xfs_dablk_t blkno; /* filesystem blkno of buffer */
+ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
+ int index; /* relevant index into block */
+ xfs_dahash_t hashval; /* last hash value in block */
+ int magic; /* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+ int active; /* number of active levels */
+ xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+ xfs_da_args_t *args; /* filename arguments */
+ struct xfs_mount *mp; /* filesystem mount point */
+ xfs_da_state_path_t path; /* search/split paths */
+ xfs_da_state_path_t altpath; /* alternate path for join */
+ unsigned char inleaf; /* insert into 1->lf, 0->splf */
+ unsigned char extravalid; /* T/F: extrablk is in use */
+ unsigned char extraafter; /* T/F: extrablk is after new */
+ xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
+ /* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_da_node_entry *btree;
+};
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+ int level, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_da3_join(xfs_da_state_t *state);
+void xfs_da3_fixhashpath(struct xfs_da_state *state,
+ struct xfs_da_state_path *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+ int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+ xfs_da_state_blk_t *new_blk);
+int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_daddr_t mappedbno, struct xfs_buf **bpp,
+ int whichfork);
+
+/*
+ * Utility routines.
+ */
+
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
+
+int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+ int count);
+int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
+int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
+ int whichfork, const struct xfs_buf_ops *ops);
+int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags, int whichfork,
+ const struct xfs_buf_ops *ops);
+int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf);
+
+uint xfs_da_hashname(const uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
+void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
+
+void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
+void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
+ struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+
+extern struct kmem_cache *xfs_da_state_cache;
+
+#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644
index 000000000..25e284108
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -0,0 +1,805 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* previous block in list */
+ __be32 back; /* following block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ * - shortform - embedded into the inode
+ * - single block - data with embedded leaf at the end
+ * - multiple data blocks, single leaf+freeindex block
+ * - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
+#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
+#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories. The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ * - a larger block header for CRC and identification purposes and so the
+ * offsets of all the structures inside the blocks are different.
+ *
+ * - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN 0
+#define XFS_DIR3_FT_REG_FILE 1
+#define XFS_DIR3_FT_DIR 2
+#define XFS_DIR3_FT_CHRDEV 3
+#define XFS_DIR3_FT_BLKDEV 4
+#define XFS_DIR3_FT_FIFO 5
+#define XFS_DIR3_FT_SOCK 6
+#define XFS_DIR3_FT_SYMLINK 7
+#define XFS_DIR3_FT_WHT 8
+
+#define XFS_DIR3_FT_MAX 9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef uint16_t xfs_dir2_data_off_t;
+#define NULLDATAOFF 0xffffU
+typedef uint xfs_dir2_data_aoff_t; /* argument form */
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef uint32_t xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef uint32_t xfs_dir2_db_t;
+
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
+
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode. These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures. Due the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structure are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+ uint8_t count; /* count of entries */
+ uint8_t i8count; /* count of 8-byte inode #s */
+ uint8_t parent[8]; /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+ __u8 namelen; /* actual name length */
+ __u8 offset[2]; /* saved offset */
+ __u8 name[]; /* name, variable size */
+ /*
+ * A single byte containing the file type field follows the inode
+ * number for version 3 directory entries.
+ *
+ * A 64-bit or 32-bit inode number follows here, at a variable offset
+ * after the name.
+ */
+} __packed xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+ return sizeof(struct xfs_dir2_sf_hdr) -
+ (i8count == 0) * XFS_INO64_DIFF;
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+ return get_unaligned_be16(sfep->offset);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+ put_unaligned_be16(off, sfep->offset);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG 0xffff
+#define XFS_DIR2_DATA_FD_COUNT 3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_MAX_SPACES 3
+#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE 0
+#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+ __be16 offset; /* start of freespace */
+ __be16 length; /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+ __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
+ /* XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes. After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+ __be64 inumber; /* inode number */
+ __u8 namelen; /* name length */
+ __u8 name[]; /* name bytes, no null */
+ /* __u8 filetype; */ /* type of inode we point to */
+ /* __be16 tag; */ /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+ __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
+ __be16 length; /* total free length */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+ return (__be16 *)((char *)dup +
+ be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ * +---------------------------+
+ * | xfs_dir2_leaf_hdr_t |
+ * +---------------------------+
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_leaf_tail_t |
+ * +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space. First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE 1
+#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+ xfs_da_blkinfo_t info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+ struct xfs_da3_blkinfo info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+ __be32 hashval; /* hash value of name */
+ __be32 address; /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+ __be32 bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t __ents[]; /* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+ struct xfs_dir3_leaf_hdr hdr; /* leaf header */
+ struct xfs_dir2_leaf_entry __ents[]; /* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+ return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Free space block definitions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE 2
+#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef struct xfs_dir2_free_hdr {
+ __be32 magic; /* XFS_DIR2_FREE_MAGIC */
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+ xfs_dir2_free_hdr_t hdr; /* block header */
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+ struct xfs_dir3_free_hdr hdr;
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ * | ... |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_block_tail_t |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+ __be32 count; /* count of leaf entries */
+ __be32 stale; /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+struct xfs_attr_shortform {
+ struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ __u8 padding;
+ } hdr;
+ struct xfs_attr_sf_entry {
+ uint8_t namelen; /* actual length of name (no NULL) */
+ uint8_t valuelen; /* actual length of value (no NULL) */
+ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ uint8_t nameval[]; /* name & value bytes concatenated */
+ } list[1]; /* variable sized array */
+};
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced and definining them can actually make gcc optimize away
+ * accesses to the 'entries' array above index 0 so don't do that.
+ *
+ * xfs_attr_leaf_name_local_t namelist;
+ * xfs_attr_leaf_name_remote_t valuelist;
+ */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced, the locations accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * Special value to represent fs block size in the leaf header firstused field.
+ * Only used when block size overflows the 2-bytes available on disk.
+ */
+#define XFS_ATTR3_LEAF_NULLOFF 0
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ */
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic;
+ __be32 rm_offset;
+ __be32 rm_bytes;
+ __be32 rm_crc;
+ uuid_t rm_uuid;
+ __be64 rm_owner;
+ __be64 rm_blkno;
+ __be64 rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+/* Number of bytes in a directory block. */
+static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
+{
+ return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
+}
+
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
new file mode 100644
index 000000000..5a321b783
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
+
+/*
+ * Deferred Operations in XFS
+ *
+ * Due to the way locking rules work in XFS, certain transactions (block
+ * mapping and unmapping, typically) have permanent reservations so that
+ * we can roll the transaction to adhere to AG locking order rules and
+ * to unlock buffers between metadata updates. Prior to rmap/reflink,
+ * the mapping code had a mechanism to perform these deferrals for
+ * extents that were going to be freed; this code makes that facility
+ * more generic.
+ *
+ * When adding the reverse mapping and reflink features, it became
+ * necessary to perform complex remapping multi-transactions to comply
+ * with AG locking order rules, and to be able to spread a single
+ * refcount update operation (an operation on an n-block extent can
+ * update as many as n records!) among multiple transactions. XFS can
+ * roll a transaction to facilitate this, but using this facility
+ * requires us to log "intent" items in case log recovery needs to
+ * redo the operation, and to log "done" items to indicate that redo
+ * is not necessary.
+ *
+ * Deferred work is tracked in xfs_defer_pending items. Each pending
+ * item tracks one type of deferred work. Incoming work items (which
+ * have not yet had an intent logged) are attached to a pending item
+ * on the dop_intake list, where they wait for the caller to finish
+ * the deferred operations.
+ *
+ * Finishing a set of deferred operations is an involved process. To
+ * start, we define "rolling a deferred-op transaction" as follows:
+ *
+ * > For each xfs_defer_pending item on the dop_intake list,
+ * - Sort the work items in AG order. XFS locking
+ * order rules require us to lock buffers in AG order.
+ * - Create a log intent item for that type.
+ * - Attach it to the pending item.
+ * - Move the pending item from the dop_intake list to the
+ * dop_pending list.
+ * > Roll the transaction.
+ *
+ * NOTE: To avoid exceeding the transaction reservation, we limit the
+ * number of items that we attach to a given xfs_defer_pending.
+ *
+ * The actual finishing process looks like this:
+ *
+ * > For each xfs_defer_pending in the dop_pending list,
+ * - Roll the deferred-op transaction as above.
+ * - Create a log done item for that type, and attach it to the
+ * log intent item.
+ * - For each work item attached to the log intent item,
+ * * Perform the described action.
+ * * Attach the work item to the log done item.
+ * * If the result of doing the work was -EAGAIN, ->finish work
+ * wants a new transaction. See the "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" section below for
+ * details.
+ *
+ * The key here is that we must log an intent item for all pending
+ * work items every time we roll the transaction, and that we must log
+ * a done item as soon as the work is completed. With this mechanism
+ * we can perform complex remapping operations, chaining intent items
+ * as needed.
+ *
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation. The most likely cause of this circumstance are the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off. The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish. Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work. Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue. Then it rolls the
+ * transaction and picks up processing where it left off. It is
+ * required that ->finish_item must be careful to leave enough
+ * transaction reservation to fit the new log intent item.
+ *
+ * This is an example of remapping the extent (E, E+B) into file X at
+ * offset A and dealing with the extent (C, C+B) already being mapped
+ * there:
+ * +-------------------------------------------------+
+ * | Unmap file X startblock C offset A length B | t0
+ * | Intent to reduce refcount for extent (C, B) |
+ * | Intent to remove rmap (X, C, A, B) |
+ * | Intent to free extent (D, 1) (bmbt block) |
+ * | Intent to map (X, A, B) at startblock E |
+ * +-------------------------------------------------+
+ * | Map file X startblock E offset A length B | t1
+ * | Done mapping (X, E, A, B) |
+ * | Intent to increase refcount for extent (E, B) |
+ * | Intent to add rmap (X, E, A, B) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C, B) | t2
+ * | Done reducing refcount for extent (C, 9) |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B+9) | t3
+ * | Done reducing refcount for extent (C+9, B-9) |
+ * | Increase refcount for extent (E, B) |
+ * | Done increasing refcount for extent (E, B) |
+ * | Intent to free extent (C, B) |
+ * | Intent to free extent (F, 1) (refcountbt block) |
+ * | Intent to remove rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Remove rmap (X, C, A, B) | t4
+ * | Done removing rmap (X, C, A, B) |
+ * | Add rmap (X, E, A, B) |
+ * | Done adding rmap (X, E, A, B) |
+ * | Remove rmap (F, 1, REFC) |
+ * | Done removing rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Free extent (C, B) | t5
+ * | Done freeing extent (C, B) |
+ * | Free extent (D, 1) |
+ * | Done freeing extent (D, 1) |
+ * | Free extent (F, 1) |
+ * | Done freeing extent (F, 1) |
+ * +-------------------------------------------------+
+ *
+ * If we should crash before t2 commits, log recovery replays
+ * the following intent items:
+ *
+ * - Intent to reduce refcount for extent (C, B)
+ * - Intent to remove rmap (X, C, A, B)
+ * - Intent to free extent (D, 1) (bmbt block)
+ * - Intent to increase refcount for extent (E, B)
+ * - Intent to add rmap (X, E, A, B)
+ *
+ * In the process of recovering, it should also generate and take care
+ * of these intent items:
+ *
+ * - Intent to free extent (C, B)
+ * - Intent to free extent (F, 1) (refcountbt block)
+ * - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
+ */
+
+static const struct xfs_defer_op_type *defer_op_types[] = {
+ [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
+};
+
+/*
+ * Ensure there's a log intent item associated with this deferred work item if
+ * the operation must be restarted on crash. Returns 1 if there's a log item;
+ * 0 if there isn't; or a negative errno.
+ */
+static int
+xfs_defer_create_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp,
+ bool sort)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_log_item *lip;
+
+ if (dfp->dfp_intent)
+ return 1;
+
+ lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ if (!lip)
+ return 0;
+ if (IS_ERR(lip))
+ return PTR_ERR(lip);
+
+ dfp->dfp_intent = lip;
+ return 1;
+}
+
+/*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+ * the pending list.
+ *
+ * Returns 1 if at least one log item was associated with the deferred work;
+ * 0 if there are no log items; or a negative errno.
+ */
+static int
+xfs_defer_create_intents(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_pending *dfp;
+ int ret = 0;
+
+ list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
+ int ret2;
+
+ trace_xfs_defer_create_intent(tp->t_mountp, dfp);
+ ret2 = xfs_defer_create_intent(tp, dfp, true);
+ if (ret2 < 0)
+ return ret2;
+ ret |= ret2;
+ }
+ return ret;
+}
+
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
+
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+
+ /* Abort intent items that don't have a done item. */
+ list_for_each_entry(dfp, dop_pending, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
+ trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ if (dfp->dfp_intent && !dfp->dfp_done) {
+ ops->abort_intent(dfp->dfp_intent);
+ dfp->dfp_intent = NULL;
+ }
+ }
+}
+
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
+{
+ struct xfs_buf_log_item *bli;
+ struct xfs_inode_log_item *ili;
+ struct xfs_log_item *lip;
+
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
+
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ switch (lip->li_type) {
+ case XFS_LI_BUF:
+ bli = container_of(lip, struct xfs_buf_log_item,
+ bli_item);
+ if (bli->bli_flags & XFS_BLI_HOLD) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (bli->bli_flags & XFS_BLI_ORDERED)
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
+ else
+ xfs_trans_dirty_buf(tp, bli->bli_buf);
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
+ }
+ break;
+ case XFS_LI_INODE:
+ ili = container_of(lip, struct xfs_inode_log_item,
+ ili_item);
+ if (ili->ili_lock_flags == 0) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ xfs_trans_log_inode(tp, ili->ili_inode,
+ XFS_ILOG_CORE);
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Attach the held resources to the transaction. */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
+
+ /*
+ * Roll the transaction. Rolling always given a new transaction (even
+ * if committing the old one fails!) to hand back to the caller, so we
+ * join the held resources to the new transaction so that we always
+ * return with the held resources joined to @tpp, no matter what
+ * happened.
+ */
+ error = xfs_trans_roll(tpp);
+
+ xfs_defer_restore_resources(*tpp, &dres);
+
+ if (error)
+ trace_xfs_defer_trans_roll_error(*tpp, error);
+ return error;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+static void
+xfs_defer_cancel_list(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+ struct list_head *pwi;
+ struct list_head *n;
+ const struct xfs_defer_op_type *ops;
+
+ /*
+ * Free the pending items. Caller should already have arranged
+ * for the intent items to be released.
+ */
+ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
+ trace_xfs_defer_cancel_list(mp, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ ops->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+ }
+}
+
+/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+static int
+xfs_defer_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_btree_cur *state = NULL;
+ struct list_head *li, *n;
+ int error;
+
+ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = ops->finish_item(tp, dfp->dfp_done, li, &state);
+ if (error == -EAGAIN) {
+ int ret;
+
+ /*
+ * Caller wants a fresh transaction; put the work item
+ * back on the list and log a new log intent item to
+ * replace the old one. See "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" above.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
+ ret = xfs_defer_create_intent(tp, dfp, false);
+ if (ret < 0)
+ error = ret;
+ }
+
+ if (error)
+ goto out;
+ }
+
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+out:
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(tp, state, error);
+ return error;
+}
+
+/*
+ * Finish all the pending work. This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+ * work items in the first item on the logged-and-pending list.
+ *
+ * If an inode is provided, relog it to the new transaction.
+ */
+int
+xfs_defer_finish_noroll(
+ struct xfs_trans **tp)
+{
+ struct xfs_defer_pending *dfp = NULL;
+ int error = 0;
+ LIST_HEAD(dop_pending);
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ trace_xfs_defer_finish(*tp, _RET_IP_);
+
+ /* Until we run out of pending work to finish... */
+ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+ /*
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
+ */
+ int has_intents = xfs_defer_create_intents(*tp);
+
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
+ if (has_intents < 0) {
+ error = has_intents;
+ goto out_shutdown;
+ }
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
+
+ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
+ dfp_list);
+ error = xfs_defer_finish_one(*tp, dfp);
+ if (error && error != -EAGAIN)
+ goto out_shutdown;
+ }
+
+ trace_xfs_defer_finish_done(*tp, _RET_IP_);
+ return 0;
+
+out_shutdown:
+ xfs_defer_trans_abort(*tp, &dop_pending);
+ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ trace_xfs_defer_finish_error(*tp, error);
+ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+ xfs_defer_cancel(*tp);
+ return error;
+}
+
+int
+xfs_defer_finish(
+ struct xfs_trans **tp)
+{
+ int error;
+
+ /*
+ * Finish and roll the transaction once more to avoid returning to the
+ * caller with a dirty transaction.
+ */
+ error = xfs_defer_finish_noroll(tp);
+ if (error)
+ return error;
+ if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
+ error = xfs_defer_trans_roll(tp);
+ if (error) {
+ xfs_force_shutdown((*tp)->t_mountp,
+ SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+ }
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
+ return 0;
+}
+
+void
+xfs_defer_cancel(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ trace_xfs_defer_cancel(tp, _RET_IP_);
+ xfs_defer_cancel_list(mp, &tp->t_dfops);
+}
+
+/* Add an item for later deferred processing. */
+void
+xfs_defer_add(
+ struct xfs_trans *tp,
+ enum xfs_defer_ops_type type,
+ struct list_head *li)
+{
+ struct xfs_defer_pending *dfp = NULL;
+ const struct xfs_defer_op_type *ops;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
+
+ /*
+ * Add the item to a pending item at the end of the intake list.
+ * If the last pending item has the same type, reuse it. Else,
+ * create a new pending item at the end of the intake list.
+ */
+ if (!list_empty(&tp->t_dfops)) {
+ dfp = list_last_entry(&tp->t_dfops,
+ struct xfs_defer_pending, dfp_list);
+ ops = defer_op_types[dfp->dfp_type];
+ if (dfp->dfp_type != type ||
+ (ops->max_items && dfp->dfp_count >= ops->max_items))
+ dfp = NULL;
+ }
+ if (!dfp) {
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_type = type;
+ dfp->dfp_intent = NULL;
+ dfp->dfp_done = NULL;
+ dfp->dfp_count = 0;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+ }
+
+ list_add_tail(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
+/*
+ * Move deferred ops from one transaction to another and reset the source to
+ * initial state. This is primarily used to carry state forward across
+ * transaction rolls with pending dfops.
+ */
+void
+xfs_defer_move(
+ struct xfs_trans *dtp,
+ struct xfs_trans *stp)
+{
+ list_splice_init(&stp->t_dfops, &dtp->t_dfops);
+
+ /*
+ * Low free space mode was historically controlled by a dfops field.
+ * This meant that low mode state potentially carried across multiple
+ * transaction rolls. Transfer low mode on a dfops move to preserve
+ * that behavior.
+ */
+ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ error = xfs_defer_create_intents(tp);
+ if (error < 0)
+ return ERR_PTR(error);
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ /*
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
+ */
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
+ }
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_capture_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ unsigned short i;
+
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp);
+ if (IS_ERR(dfc)) {
+ xfs_trans_cancel(tp);
+ return PTR_ERR(dfc);
+ }
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_capture_free(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned int i;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock the captured resources to the new transaction. */
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+
+ kmem_free(dfc);
+}
+
+/* Release the resources captured and continued during recovery. */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/* Set up caches for deferred work items. */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error;
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_attr_intent_init_cache();
+ if (error)
+ goto err;
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/* Destroy all the deferred work item caches, if they've been allocated. */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_attr_intent_destroy_cache();
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
new file mode 100644
index 000000000..114a3a493
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_DEFER_H__
+#define __XFS_DEFER_H__
+
+struct xfs_btree_cur;
+struct xfs_defer_op_type;
+struct xfs_defer_capture;
+
+/*
+ * Header for deferred operation list.
+ */
+enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_BMAP,
+ XFS_DEFER_OPS_TYPE_REFCOUNT,
+ XFS_DEFER_OPS_TYPE_RMAP,
+ XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
+ XFS_DEFER_OPS_TYPE_MAX,
+};
+
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ struct list_head dfp_list; /* pending items */
+ struct list_head dfp_work; /* work items */
+ struct xfs_log_item *dfp_intent; /* log intent item */
+ struct xfs_log_item *dfp_done; /* log done item */
+ unsigned int dfp_count; /* # extent items */
+ enum xfs_defer_ops_type dfp_type;
+};
+
+void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
+ struct list_head *h);
+int xfs_defer_finish_noroll(struct xfs_trans **tp);
+int xfs_defer_finish(struct xfs_trans **tp);
+void xfs_defer_cancel(struct xfs_trans *);
+void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
+
+/* Description of a deferred type. */
+struct xfs_defer_op_type {
+ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+ struct list_head *items, unsigned int count, bool sort);
+ void (*abort_intent)(struct xfs_log_item *intent);
+ struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
+ struct xfs_log_item *intent, unsigned int count);
+ int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done,
+ struct list_head *item, struct xfs_btree_cur **state);
+ void (*finish_cleanup)(struct xfs_trans *tp,
+ struct xfs_btree_cur *state, int error);
+ void (*cancel_item)(struct list_head *item);
+ unsigned int max_items;
+};
+
+extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
+
+/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/* Resources that must be held across a transaction roll. */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers */
+ unsigned short dr_ordered;
+
+ /* number of held inodes */
+ unsigned short dr_inos;
+};
+
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ struct xfs_defer_resources dfc_held;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
+
+#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000..92bac3373
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,769 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+const struct xfs_name xfs_name_dotdot = {
+ .name = (const unsigned char *)"..",
+ .len = 2,
+ .type = XFS_DIR3_FT_DIR,
+};
+
+/*
+ * Convert inode mode to directory entry filetype
+ */
+unsigned char
+xfs_mode_to_ftype(
+ int mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ return XFS_DIR3_FT_REG_FILE;
+ case S_IFDIR:
+ return XFS_DIR3_FT_DIR;
+ case S_IFCHR:
+ return XFS_DIR3_FT_CHRDEV;
+ case S_IFBLK:
+ return XFS_DIR3_FT_BLKDEV;
+ case S_IFIFO:
+ return XFS_DIR3_FT_FIFO;
+ case S_IFSOCK:
+ return XFS_DIR3_FT_SOCK;
+ case S_IFLNK:
+ return XFS_DIR3_FT_SYMLINK;
+ default:
+ return XFS_DIR3_FT_UNKNOWN;
+ }
+}
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+xfs_dahash_t
+xfs_ascii_ci_hashname(
+ const struct xfs_name *name)
+{
+ xfs_dahash_t hash;
+ int i;
+
+ for (i = 0, hash = 0; i < name->len; i++)
+ hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+ return hash;
+}
+
+enum xfs_dacmp
+xfs_ascii_ci_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ enum xfs_dacmp result;
+ int i;
+
+ if (args->namelen != len)
+ return XFS_CMP_DIFFERENT;
+
+ result = XFS_CMP_EXACT;
+ for (i = 0; i < len; i++) {
+ if (args->name[i] == name[i])
+ continue;
+ if (tolower(args->name[i]) != tolower(name[i]))
+ return XFS_CMP_DIFFERENT;
+ result = XFS_CMP_CASE;
+ }
+
+ return result;
+}
+
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_da_geometry *dageo;
+
+
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+ ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
+
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return -ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+ if (xfs_has_crc(mp)) {
+ dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir3_data_hdr);
+ } else {
+ dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir2_data_hdr);
+ }
+ dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) /
+ sizeof(struct xfs_dir2_leaf_entry);
+ dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) /
+ sizeof(xfs_dir2_data_off_t);
+
+ dageo->data_first_offset = dageo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, 1) +
+ xfs_dir2_data_entsize(mp, 2);
+
+ /*
+ * Now we've set up the block conversion variables, we can calculate the
+ * segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+ xfs_inode_t *dp)
+{
+ xfs_dir2_sf_hdr_t *sfp;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ if (dp->i_disk_size == 0) /* might happen during shutdown. */
+ return 1;
+ if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
+ return 0;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+ xfs_mount_t *mp,
+ xfs_ino_t ino)
+{
+ bool ino_ok = xfs_verify_dir_ino(mp, ino);
+
+ if (XFS_IS_CORRUPT(mp, !ino_ok) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
+ xfs_warn(mp, "Invalid inode number 0x%Lx",
+ (unsigned long long) ino);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ xfs_inode_t *pdp)
+{
+ struct xfs_da_args *args;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+ if (error)
+ return error;
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->dp = dp;
+ args->trans = tp;
+ error = xfs_dir2_sf_create(args, pdp->i_ino);
+ kmem_free(args);
+ return error;
+}
+
+/*
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
+ */
+int
+xfs_dir_createname(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t inum, /* new entry inode number */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ if (inum) {
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
+ return rval;
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
+ }
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ if (!inum)
+ args->op_flags |= XFS_DA_OP_JUSTCHECK;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
+ else
+ rval = xfs_dir2_node_addname(args);
+
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (args->cmpresult == XFS_CMP_DIFFERENT)
+ return -ENOENT;
+ if (args->cmpresult != XFS_CMP_CASE ||
+ !(args->op_flags & XFS_DA_OP_CILOOKUP))
+ return -EEXIST;
+
+ args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ if (!args->value)
+ return -ENOMEM;
+
+ memcpy(args->value, name, len);
+ args->valuelen = len;
+ return -EEXIST;
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
+ */
+
+int
+xfs_dir_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+ int lock_mode;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
+
+ /*
+ * We need to use KM_NOFS here so that lockdep will not throw false
+ * positive deadlock warnings on a non-transactional lookup path. It is
+ * safe to recurse into inode recalim in that case, but lockdep can't
+ * easily be taught about it. Hence KM_NOFS avoids having to add more
+ * lockdep Doing this avoids having to add a bunch of lockdep class
+ * annotations into the reclaim path for the ilock.
+ */
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
+ if (ci_name)
+ args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_lookup(args);
+ else
+ rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
+ if (rval == -EEXIST)
+ rval = 0;
+ if (!rval) {
+ *inum = args->inumber;
+ if (ci_name) {
+ ci_name->name = args->value;
+ ci_name->len = args->valuelen;
+ }
+ }
+out_free:
+ xfs_iunlock(dp, lock_mode);
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_name *name,
+ xfs_ino_t ino,
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = ino;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_removename(args);
+ else
+ rval = xfs_dir2_node_removename(args);
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name, /* name of entry to replace */
+ xfs_ino_t inum, /* new inode number */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
+ return rval;
+
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ if (!args)
+ return -ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_replace(args);
+ else
+ rval = xfs_dir2_node_replace(args);
+out_free:
+ kmem_free(args);
+ return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+int
+xfs_dir_canenter(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name) /* name of entry to add */
+{
+ return xfs_dir_createname(tp, dp, name, 0, 0);
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+ struct xfs_da_args *args,
+ int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
+ xfs_dir2_db_t *dbp) /* out: block number added */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_fileoff_t bno; /* directory offset of new block */
+ int count; /* count of filesystem blocks */
+ int error;
+
+ trace_xfs_dir2_grow_inode(args, space);
+
+ /*
+ * Set lowest possible block in the space requested.
+ */
+ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+ count = args->geo->fsbcount;
+
+ error = xfs_da_grow_inode_int(args, &bno, count);
+ if (error)
+ return error;
+
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
+ /*
+ * Update file's size if this is the data space and it grew.
+ */
+ if (space == XFS_DIR2_DATA_SPACE) {
+ xfs_fsize_t size; /* directory file (data) size */
+
+ size = XFS_FSB_TO_B(mp, bno + count);
+ if (size > dp->i_disk_size) {
+ dp->i_disk_size = size;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ }
+ }
+ return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+ struct xfs_da_args *args,
+ bool *isblock)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_fileoff_t eof;
+ int error;
+
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isblock = false;
+ if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
+ return 0;
+
+ *isblock = true;
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+ struct xfs_da_args *args,
+ bool *isleaf)
+{
+ xfs_fileoff_t eof;
+ int error;
+
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isleaf = false;
+ if (eof != args->geo->leafblk + args->geo->fsbcount)
+ return 0;
+
+ *isleaf = true;
+ return 0;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+ struct xfs_da_args *args,
+ xfs_dir2_db_t db,
+ struct xfs_buf *bp)
+{
+ xfs_fileoff_t bno; /* directory file offset */
+ xfs_dablk_t da; /* directory file offset */
+ int done; /* bunmap is finished */
+ struct xfs_inode *dp;
+ int error;
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ trace_xfs_dir2_shrink_inode(args, db);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ da = xfs_dir2_db_to_da(args->geo, db);
+
+ /* Unmap the fsblock(s). */
+ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, &done);
+ if (error) {
+ /*
+ * ENOSPC actually can happen if we're in a removename with no
+ * space reservation, and the resulting block removal would
+ * cause a bmap btree split or conversion from extents to btree.
+ * This can only happen for un-fragmented directory blocks,
+ * since you need to be punching out the middle of an extent.
+ * In this case we need to leave the block in the file, and not
+ * binval it. So the block has to be in a consistent empty
+ * state and appropriately logged. We don't free up the buffer,
+ * the caller can tell it hasn't happened since it got an error
+ * back.
+ */
+ return error;
+ }
+ ASSERT(done);
+ /*
+ * Invalidate the buffer from the transaction.
+ */
+ xfs_trans_binval(tp, bp);
+ /*
+ * If it's not a data block, we're done.
+ */
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+ return 0;
+ /*
+ * If the block isn't the last one in the directory, we're done.
+ */
+ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+ return 0;
+ bno = da;
+ if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+ /*
+ * This can't really happen unless there's kernel corruption.
+ */
+ return error;
+ }
+ if (db == args->geo->datablk)
+ ASSERT(bno == 0);
+ else
+ ASSERT(bno > 0);
+ /*
+ * Set the size to the new last block.
+ */
+ dp->i_disk_size = XFS_FSB_TO_B(mp, bno);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ return 0;
+}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
+
+xfs_dahash_t
+xfs_dir2_hashname(
+ struct xfs_mount *mp,
+ const struct xfs_name *name)
+{
+ if (unlikely(xfs_has_asciici(mp)))
+ return xfs_ascii_ci_hashname(name);
+ return xfs_da_hashname(name->name, name->len);
+}
+
+enum xfs_dacmp
+xfs_dir2_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (unlikely(xfs_has_asciici(args->dp->i_mount)))
+ return xfs_ascii_ci_compname(args, name, len);
+ return xfs_da_compname(args, name, len);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000..dd39f17dd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+struct xfs_dir3_icfree_hdr;
+struct xfs_dir3_icleaf_hdr;
+
+extern const struct xfs_name xfs_name_dotdot;
+
+/*
+ * Convert inode mode to directory entry filetype
+ */
+extern unsigned char xfs_mode_to_ftype(int mode);
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t inum,
+ xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t *inum,
+ struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t ino,
+ xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t inum,
+ xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name);
+
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+/*
+ * Interface routines used by userspace utilities
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+ struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern int xfs_dir2_data_use_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+ int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup);
+
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
+/*
+ * The Linux API doesn't pass down the total size of the buffer
+ * we read into down to the filesystem. With the filldir concept
+ * it's not needed for correct information, but the XFS dir2 leaf
+ * code wants an estimate of the buffer size to calculate it's
+ * readahead window and size the buffers used for mapping to
+ * physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE.
+ */
+#define XFS_READDIR_BUFSIZE (32768)
+
+unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
+unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
+ struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
+
+#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000..00f960a70
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1275 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+ int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+ int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+ xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+ xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static xfs_failaddr_t
+xfs_dir3_block_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+ return __xfs_dir3_data_check(NULL, bp);
+}
+
+static void
+xfs_dir3_block_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_block_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_block_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_block_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
+ .verify_read = xfs_dir3_block_read_verify,
+ .verify_write = xfs_dir3_block_write_verify,
+ .verify_struct = xfs_dir3_block_verify,
+};
+
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+int
+xfs_dir3_block_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+}
+
+static void
+xfs_dir3_block_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+ if (xfs_has_crc(mp)) {
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+ return;
+
+ }
+ hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
+static void
+xfs_dir2_block_need_space(
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ __be16 **tagpp,
+ struct xfs_dir2_data_unused **dupp,
+ struct xfs_dir2_data_unused **enddupp,
+ int *compact,
+ int len)
+{
+ struct xfs_dir2_data_free *bf;
+ __be16 *tagp = NULL;
+ struct xfs_dir2_data_unused *dup = NULL;
+ struct xfs_dir2_data_unused *enddup = NULL;
+
+ *compact = 0;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ */
+ if (btp->stale) {
+ if (be16_to_cpu(bf[0].length) >= len) {
+ /*
+ * The biggest entry enough to avoid compaction.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ goto out;
+ }
+
+ /*
+ * Will need to compact to make this work.
+ * Tag just before the first leaf entry.
+ */
+ *compact = 1;
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ goto out;
+ }
+
+ /*
+ * no stale entries, so just use free space.
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ if (dup != enddup) {
+ /*
+ * Not the same free entry, just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len)
+ dup = NULL;
+ goto out;
+ }
+
+ /*
+ * It is the biggest freespace, can it hold the leaf too?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * Yes, use the second-largest entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ }
+out:
+ *tagpp = tagp;
+ *dupp = dup;
+ *enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ int *needlog,
+ int *lfloghigh,
+ int *lfloglow)
+{
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+ int needscan = 0;
+ int highstale; /* high stale index */
+
+ fromidx = toidx = be32_to_cpu(btp->count) - 1;
+ highstale = *lfloghigh = -1;
+ for (; fromidx >= 0; fromidx--) {
+ if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (*lfloghigh == -1)
+ *lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ needlog, &needscan);
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int /* error */
+xfs_dir2_block_addname(
+ xfs_da_args_t *args) /* directory op arguments */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ int compact; /* need to compact leaf ents */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* directory inode */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ int error; /* error return value */
+ xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */
+ xfs_dahash_t hash; /* hash value of found entry */
+ int high; /* high index for binary srch */
+ int highstale; /* high stale index */
+ int lfloghigh=0; /* last final leaf to log */
+ int lfloglow=0; /* first final leaf to log */
+ int len; /* length of the new entry */
+ int low; /* low index for binary srch */
+ int lowstale; /* low stale index */
+ int mid=0; /* midpoint for binary srch */
+ int needlog; /* need to log header */
+ int needscan; /* need to rescan freespace */
+ __be16 *tagp; /* pointer to tag value */
+ xfs_trans_t *tp; /* transaction structure */
+
+ trace_xfs_dir2_block_addname(args);
+
+ dp = args->dp;
+ tp = args->trans;
+
+ /* Read the (one and only) directory block into bp. */
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
+ return error;
+
+ len = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+
+ /*
+ * Set up pointers to parts of the block.
+ */
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Find out if we can reuse stale entries or whether we need extra
+ * space for entry and new leaf.
+ */
+ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+ &enddup, &compact, len);
+
+ /*
+ * Done everything we need for a space check now.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, bp);
+ if (!dup)
+ return -ENOSPC;
+ return 0;
+ }
+
+ /*
+ * If we don't have space for the new entry & leaf ...
+ */
+ if (!dup) {
+ /* Don't have a space reservation: return no-space. */
+ if (args->total == 0)
+ return -ENOSPC;
+ /*
+ * Convert to the next larger format.
+ * Then add the new entry in that format.
+ */
+ error = xfs_dir2_block_to_leaf(args, bp);
+ if (error)
+ return error;
+ return xfs_dir2_leaf_addname(args);
+ }
+
+ needlog = needscan = 0;
+
+ /*
+ * If need to compact the leaf entries, do it now.
+ */
+ if (compact) {
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+ &lfloghigh, &lfloglow);
+ /* recalculate blp post-compaction */
+ blp = xfs_dir2_block_leaf_p(btp);
+ } else if (btp->stale) {
+ /*
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
+ */
+ lfloglow = be32_to_cpu(btp->count);
+ lfloghigh = -1;
+ }
+
+ /*
+ * Find the slot that's first lower than our hash value, -1 if none.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+ while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+ mid--;
+ }
+ /*
+ * No stale entries, will use enddup space to hold new leaf.
+ */
+ if (!btp->stale) {
+ xfs_dir2_data_aoff_t aoff;
+
+ /*
+ * Mark the space needed for the new leaf entry, now in use.
+ */
+ aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr +
+ be16_to_cpu(enddup->length) - sizeof(*blp));
+ error = xfs_dir2_data_use_free(args, bp, enddup, aoff,
+ (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog,
+ &needscan);
+ if (error)
+ return error;
+
+ /*
+ * Update the tail (entry count).
+ */
+ be32_add_cpu(&btp->count, 1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan) {
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ needscan = 0;
+ }
+ /*
+ * Adjust pointer to the first leaf entry, we're about to move
+ * the table up one to open up space for the new leaf entry.
+ * Then adjust our index to match.
+ */
+ blp--;
+ mid++;
+ if (mid)
+ memmove(blp, &blp[1], mid * sizeof(*blp));
+ lfloglow = 0;
+ lfloghigh = mid;
+ }
+ /*
+ * Use a stale leaf for our new entry.
+ */
+ else {
+ for (lowstale = mid;
+ lowstale >= 0 &&
+ blp[lowstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ lowstale--)
+ continue;
+ for (highstale = mid + 1;
+ highstale < be32_to_cpu(btp->count) &&
+ blp[highstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+ (lowstale < 0 || mid - lowstale > highstale - mid);
+ highstale++)
+ continue;
+ /*
+ * Move entries toward the low-numbered stale entry.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be32_to_cpu(btp->count) ||
+ mid - lowstale <= highstale - mid)) {
+ if (mid - lowstale)
+ memmove(&blp[lowstale], &blp[lowstale + 1],
+ (mid - lowstale) * sizeof(*blp));
+ lfloglow = min(lowstale, lfloglow);
+ lfloghigh = max(mid, lfloghigh);
+ }
+ /*
+ * Move entries toward the high-numbered stale entry.
+ */
+ else {
+ ASSERT(highstale < be32_to_cpu(btp->count));
+ mid++;
+ if (highstale - mid)
+ memmove(&blp[mid + 1], &blp[mid],
+ (highstale - mid) * sizeof(*blp));
+ lfloglow = min(mid, lfloglow);
+ lfloghigh = max(highstale, lfloghigh);
+ }
+ be32_add_cpu(&btp->stale, -1);
+ }
+ /*
+ * Point to the new data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ /*
+ * Fill in the leaf entry.
+ */
+ blp[mid].hashval = cpu_to_be32(args->hashval);
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
+ xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+ /*
+ * Mark space for the data entry used.
+ */
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+ if (error)
+ return error;
+ /*
+ * Create the new data entry.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, args->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ /*
+ * Clean up the bestfree array and log the header, tail, and entry.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+ xfs_trans_t *tp, /* transaction structure */
+ struct xfs_buf *bp, /* block buffer */
+ int first, /* index of first logged leaf */
+ int last) /* index of last logged leaf */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_leaf_entry_t *blp;
+ xfs_dir2_block_tail_t *btp;
+
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+ (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+ xfs_trans_t *tp, /* transaction structure */
+ struct xfs_buf *bp) /* block buffer */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_block_tail_t *btp;
+
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+ (uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block. This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int /* error */
+xfs_dir2_block_lookup(
+ xfs_da_args_t *args) /* dir lookup arguments */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* entry index */
+ int error; /* error return value */
+
+ trace_xfs_dir2_block_lookup(args);
+
+ /*
+ * Get the buffer, look up the entry.
+ * If not found (ENOENT) then return, have no buffer.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+ return error;
+ dp = args->dp;
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Get the offset from the leaf entry, to point to the data.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ /*
+ * Fill in inode number, CI name if appropriate, release the block.
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int /* error */
+xfs_dir2_block_lookup_int(
+ xfs_da_args_t *args, /* dir lookup arguments */
+ struct xfs_buf **bpp, /* returned block buffer */
+ int *entno) /* returned entry number */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int error; /* error return value */
+ xfs_dahash_t hash; /* found hash value */
+ int high; /* binary search high index */
+ int low; /* binary search low index */
+ int mid; /* binary search current idx */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+
+ dp = args->dp;
+ tp = args->trans;
+
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
+ return error;
+
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Loop doing a binary search for our hash value.
+ * Find our entry, ENOENT if it's not there.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+ ASSERT(low <= high);
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ if (low > high) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ xfs_trans_brelse(tp, bp);
+ return -ENOENT;
+ }
+ }
+ /*
+ * Back up to the first one with the right hash value.
+ */
+ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+ mid--;
+ }
+ /*
+ * Now loop forward through all the entries with the
+ * right hash value looking for our name.
+ */
+ do {
+ if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get pointer to the entry from the leaf.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *bpp = bp;
+ *entno = mid;
+ if (cmp == XFS_CMP_EXACT)
+ return 0;
+ }
+ } while (++mid < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[mid].hashval) == hash);
+
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was found earlier, return success.
+ */
+ if (args->cmpresult == XFS_CMP_CASE)
+ return 0;
+ /*
+ * No match, release the buffer and return ENOENT.
+ */
+ xfs_trans_brelse(tp, bp);
+ return -ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int /* error */
+xfs_dir2_block_removename(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* block leaf entry index */
+ int error; /* error return value */
+ int needlog; /* need to log block header */
+ int needscan; /* need to fixup bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* shortform size */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_block_removename(args);
+
+ /*
+ * Look up the entry in the block. Gets the buffer and entry index.
+ * It will always be there, the vnodeops level does a lookup first.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ tp = args->trans;
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry using the leaf entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ /*
+ * Mark the data entry's space free.
+ */
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * Fix up the block tail.
+ */
+ be32_add_cpu(&btp->stale, 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ /*
+ * Remove the leaf entry by marking it stale.
+ */
+ blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+ /*
+ * Fix up bestfree, log the header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * See if the size as a shortform is good enough.
+ */
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > xfs_inode_data_fork_size(dp))
+ return 0;
+
+ /*
+ * If it works, do the conversion.
+ */
+ return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int /* error */
+xfs_dir2_block_replace(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* leaf entry index */
+ int error; /* error return value */
+
+ trace_xfs_dir2_block_replace(args);
+
+ /*
+ * Lookup the entry in the directory. Get buffer and entry index.
+ * This will always succeed since the caller has already done a lookup.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry we need to change.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+ /*
+ * Change the inode number to the new value.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int /* sort order */
+xfs_dir2_block_sort(
+ const void *a, /* first leaf entry */
+ const void *b) /* second leaf entry */
+{
+ const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
+ const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
+
+ la = a;
+ lb = b;
+ return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+ (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int /* error */
+xfs_dir2_leaf_to_block(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp, /* leaf buffer */
+ struct xfs_buf *dbp) /* data buffer */
+{
+ __be16 *bestsp; /* leaf bests table */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* unused data entry */
+ int error; /* error return value */
+ int from; /* leaf from index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* file system mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to scan for bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* bytes used */
+ __be16 *tagp; /* end of entry (tag) */
+ int to; /* block/leaf to index */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_to_block(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = lbp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+ /*
+ * If there are data blocks other than the first one, take this
+ * opportunity to remove trailing empty data blocks that may have
+ * been left behind during no-space-reservation operations.
+ * These will show up in the leaf bests table.
+ */
+ while (dp->i_disk_size > args->geo->blksize) {
+ int hdrsz;
+
+ hdrsz = args->geo->data_entry_offset;
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+ args->geo->blksize - hdrsz) {
+ if ((error =
+ xfs_dir2_leaf_trim_data(args, lbp,
+ (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+ return error;
+ } else
+ return 0;
+ }
+ /*
+ * Read the data block if we don't already have it, give up if it fails.
+ */
+ if (!dbp) {
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
+ if (error)
+ return error;
+ }
+ hdr = dbp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+ /*
+ * Size of the "leaf" area in the block.
+ */
+ size = (uint)sizeof(xfs_dir2_block_tail_t) +
+ (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+ /*
+ * Look at the last data entry.
+ */
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+ /*
+ * If it's not free or is too short we can't do it.
+ */
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+ be16_to_cpu(dup->length) < size)
+ return 0;
+
+ /*
+ * Start converting it to block form.
+ */
+ xfs_dir3_block_init(mp, tp, dbp, dp);
+
+ needlog = 1;
+ needscan = 0;
+ /*
+ * Use up the space at the end of the block (blp/btp).
+ */
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ args->geo->blksize - size, size, &needlog, &needscan);
+ if (error)
+ return error;
+ /*
+ * Initialize the block tail.
+ */
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+ btp->stale = 0;
+ xfs_dir2_block_log_tail(tp, dbp);
+ /*
+ * Initialize the block leaf area. We compact out stale entries.
+ */
+ lep = xfs_dir2_block_leaf_p(btp);
+ for (from = to = 0; from < leafhdr.count; from++) {
+ if (leafhdr.ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ continue;
+ lep[to++] = leafhdr.ents[from];
+ }
+ ASSERT(to == be32_to_cpu(btp->count));
+ xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+ /*
+ * Scan the bestfree if we need it and log the data block header.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ /*
+ * Pitch the old leaf block.
+ */
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+ if (error)
+ return error;
+
+ /*
+ * Now see if the resulting block can be shrunken to shortform.
+ */
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > xfs_inode_data_fork_size(dp))
+ return 0;
+
+ return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int /* error */
+xfs_dir2_sf_to_block(
+ struct xfs_da_args *args)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ struct xfs_da_geometry *geo = args->geo;
+ xfs_dir2_db_t blkno; /* dir-relative block # (0) */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail pointer */
+ xfs_dir2_data_entry_t *dep; /* data entry pointer */
+ int dummy; /* trash */
+ xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ int endoffset; /* end of data objects */
+ int error; /* error return value */
+ int i; /* index */
+ int needlog; /* need to log block header */
+ int needscan; /* need to scan block freespc */
+ int newoffset; /* offset from current entry */
+ unsigned int offset = geo->data_entry_offset;
+ xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform header */
+ __be16 *tagp; /* end of data entry */
+ struct xfs_name name;
+
+ trace_xfs_dir2_sf_to_block(args);
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+ ASSERT(ifp->if_bytes == dp->i_disk_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_df.if_nextents == 0);
+
+ /*
+ * Copy the directory into a temporary buffer.
+ * Then pitch the incore inode data so we can make extents.
+ */
+ sfp = kmem_alloc(ifp->if_bytes, 0);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
+
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
+ dp->i_disk_size = 0;
+
+ /*
+ * Add block 0 to the inode.
+ */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+ if (error)
+ goto out_free;
+ /*
+ * Initialize the data block, then convert it to block format.
+ */
+ error = xfs_dir3_data_init(args, blkno, &bp);
+ if (error)
+ goto out_free;
+ xfs_dir3_block_init(mp, tp, bp, dp);
+ hdr = bp->b_addr;
+
+ /*
+ * Compute size of block "tail" area.
+ */
+ i = (uint)sizeof(*btp) +
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+ /*
+ * The whole thing is initialized to free by the init routine.
+ * Say we're using the leaf and tail area.
+ */
+ dup = bp->b_addr + offset;
+ needlog = needscan = 0;
+ error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
+ if (error)
+ goto out_free;
+ ASSERT(needscan == 0);
+ /*
+ * Fill in the tail.
+ */
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
+ btp->stale = 0;
+ blp = xfs_dir2_block_leaf_p(btp);
+ endoffset = (uint)((char *)blp - (char *)hdr);
+ /*
+ * Remove the freespace, we'll manage it.
+ */
+ error = xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ be16_to_cpu(dup->length), &needlog, &needscan);
+ if (error)
+ goto out_free;
+
+ /*
+ * Create entry for .
+ */
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(dp->i_ino);
+ dep->namelen = 1;
+ dep->name[0] = '.';
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
+ /*
+ * Create entry for ..
+ */
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
+ dep->namelen = 2;
+ dep->name[0] = dep->name[1] = '.';
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
+ /*
+ * Loop over existing entries, stuff them in.
+ */
+ i = 0;
+ if (!sfp->count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_firstentry(sfp);
+
+ /*
+ * Need to preserve the existing offset values in the sf directory.
+ * Insert holes (unused entries) where necessary.
+ */
+ while (offset < endoffset) {
+ /*
+ * sfep is null when we reach the end of the list.
+ */
+ if (sfep == NULL)
+ newoffset = endoffset;
+ else
+ newoffset = xfs_dir2_sf_get_offset(sfep);
+ /*
+ * There should be a hole here, make one.
+ */
+ if (offset < newoffset) {
+ dup = bp->b_addr + offset;
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = cpu_to_be16(newoffset - offset);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ xfs_dir2_data_freeinsert(hdr,
+ xfs_dir2_data_bestfree_p(mp, hdr),
+ dup, &dummy);
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+ /*
+ * Copy a real entry.
+ */
+ dep = bp->b_addr + newoffset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep));
+ dep->namelen = sfep->namelen;
+ xfs_dir2_data_put_ftype(mp, dep,
+ xfs_dir2_sf_get_ftype(mp, sfep));
+ memcpy(dep->name, sfep->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(newoffset);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name));
+ blp[2 + i].address =
+ cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset));
+ offset = (int)((char *)(tagp + 1) - (char *)hdr);
+ if (++i == sfp->count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ /* Done with the temporary buffer */
+ kmem_free(sfp);
+ /*
+ * Sort the leaf entries by hash value.
+ */
+ xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+ /*
+ * Log the leaf entry area and tail.
+ * Already logged the header in data_init, ignore needlog.
+ */
+ ASSERT(needscan == 0);
+ xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir3_data_check(dp, bp);
+ return 0;
+out_free:
+ kmem_free(sfp);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000..dbcf58979
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+static xfs_failaddr_t xfs_dir2_data_freefind_verify(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup,
+ struct xfs_dir2_data_free **bf_ent);
+
+struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ if (xfs_has_crc(mp))
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+ return hdr->bestfree;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+__be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16));
+}
+
+uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ if (xfs_has_ftype(mp)) {
+ uint8_t ftype = dep->name[dep->namelen];
+
+ if (likely(ftype < XFS_DIR3_FT_MAX))
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_data_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ if (xfs_has_ftype(mp))
+ dep->name[dep->namelen] = ftype;
+}
+
+/*
+ * The number of leaf entries is limited by the size of the block and the amount
+ * of space used by the data entries. We don't know how much space is used by
+ * the data entries yet, so just ensure that the count falls somewhere inside
+ * the block right now.
+ */
+static inline unsigned int
+xfs_dir2_data_max_leaf_entries(
+ struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_block_tail) -
+ geo->data_entry_offset) /
+ sizeof(struct xfs_dir2_leaf_entry);
+}
+
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return NULL if the buffer is good, otherwise the address of the error.
+ */
+xfs_failaddr_t
+__xfs_dir3_data_check(
+ struct xfs_inode *dp, /* incore inode pointer */
+ struct xfs_buf *bp) /* data block's buffer */
+{
+ xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
+ xfs_dir2_data_free_t *bf; /* bestfree table */
+ xfs_dir2_block_tail_t *btp=NULL; /* block tail */
+ int count; /* count of entries found */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ int freeseen; /* mask of bestfrees seen */
+ xfs_dahash_t hash; /* hash of current name */
+ int i; /* leaf index */
+ int lastfree; /* last entry was unused */
+ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
+ struct xfs_mount *mp = bp->b_mount;
+ int stale; /* count of stale leaves */
+ struct xfs_name name;
+ unsigned int offset;
+ unsigned int end;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+
+ /*
+ * If this isn't a directory, something is seriously wrong. Bail out.
+ */
+ if (dp && !S_ISDIR(VFS_I(dp)->i_mode))
+ return __this_address;
+
+ hdr = bp->b_addr;
+ offset = geo->data_entry_offset;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ lep = xfs_dir2_block_leaf_p(btp);
+
+ if (be32_to_cpu(btp->count) >=
+ xfs_dir2_data_max_leaf_entries(geo))
+ return __this_address;
+ break;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ break;
+ default:
+ return __this_address;
+ }
+ end = xfs_dir3_data_end_offset(geo, hdr);
+ if (!end)
+ return __this_address;
+
+ /*
+ * Account for zero bestfree entries.
+ */
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ count = lastfree = freeseen = 0;
+ if (!bf[0].length) {
+ if (bf[0].offset)
+ return __this_address;
+ freeseen |= 1 << 0;
+ }
+ if (!bf[1].length) {
+ if (bf[1].offset)
+ return __this_address;
+ freeseen |= 1 << 1;
+ }
+ if (!bf[2].length) {
+ if (bf[2].offset)
+ return __this_address;
+ freeseen |= 1 << 2;
+ }
+
+ if (be16_to_cpu(bf[0].length) < be16_to_cpu(bf[1].length))
+ return __this_address;
+ if (be16_to_cpu(bf[1].length) < be16_to_cpu(bf[2].length))
+ return __this_address;
+ /*
+ * Loop over the data/unused entries.
+ */
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ /*
+ * If it's unused, look for the space in the bestfree table.
+ * If we find it, account for that, else make sure it
+ * doesn't need to be there.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ xfs_failaddr_t fa;
+
+ if (lastfree != 0)
+ return __this_address;
+ if (offset + be16_to_cpu(dup->length) > end)
+ return __this_address;
+ if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
+ offset)
+ return __this_address;
+ fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp);
+ if (fa)
+ return fa;
+ if (dfp) {
+ i = (int)(dfp - bf);
+ if ((freeseen & (1 << i)) != 0)
+ return __this_address;
+ freeseen |= 1 << i;
+ } else {
+ if (be16_to_cpu(dup->length) >
+ be16_to_cpu(bf[2].length))
+ return __this_address;
+ }
+ offset += be16_to_cpu(dup->length);
+ lastfree = 1;
+ continue;
+ }
+ /*
+ * It's a real entry. Validate the fields.
+ * If this is a block directory then make sure it's
+ * in the leaf section of the block.
+ * The linear search is crude but this is DEBUG code.
+ */
+ if (dep->namelen == 0)
+ return __this_address;
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
+ return __this_address;
+ if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
+ return __this_address;
+ if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
+ return __this_address;
+ if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX)
+ return __this_address;
+ count++;
+ lastfree = 0;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
+ name.name = dep->name;
+ name.len = dep->namelen;
+ hash = xfs_dir2_hashname(mp, &name);
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if (be32_to_cpu(lep[i].address) == addr &&
+ be32_to_cpu(lep[i].hashval) == hash)
+ break;
+ }
+ if (i >= be32_to_cpu(btp->count))
+ return __this_address;
+ }
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+ /*
+ * Need to have seen all the entries and all the bestfree slots.
+ */
+ if (freeseen != 7)
+ return __this_address;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+ if (lep[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ if (i > 0 && be32_to_cpu(lep[i].hashval) <
+ be32_to_cpu(lep[i - 1].hashval))
+ return __this_address;
+ }
+ if (count != be32_to_cpu(btp->count) - be32_to_cpu(btp->stale))
+ return __this_address;
+ if (stale != be32_to_cpu(btp->stale))
+ return __this_address;
+ }
+ return NULL;
+}
+
+#ifdef DEBUG
+void
+xfs_dir3_data_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = __xfs_dir3_data_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#endif
+
+static xfs_failaddr_t
+xfs_dir3_data_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+ return __xfs_dir3_data_check(NULL, bp);
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ break;
+ }
+}
+
+static void
+xfs_dir3_data_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_data_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_data_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_data_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
+ .verify_read = xfs_dir3_data_read_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+ .verify_struct = xfs_dir3_data_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
+ .verify_read = xfs_dir3_data_reada_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+};
+
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+int
+xfs_dir3_data_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ struct xfs_buf **bpp)
+{
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_data_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ return err;
+}
+
+int
+xfs_dir3_data_readahead(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags)
+{
+ return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK,
+ &xfs_dir3_data_reada_buf_ops);
+}
+
+/*
+ * Find the bestfree entry that exactly coincides with unused directory space
+ * or a verifier error because the bestfree data are bad.
+ */
+static xfs_failaddr_t
+xfs_dir2_data_freefind_verify(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup,
+ struct xfs_dir2_data_free **bf_ent)
+{
+ struct xfs_dir2_data_free *dfp;
+ xfs_dir2_data_aoff_t off;
+ bool matched = false;
+ bool seenzero = false;
+
+ *bf_ent = NULL;
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+ /*
+ * Validate some consistency in the bestfree table.
+ * Check order, non-overlapping entries, and if we find the
+ * one we're looking for it has to be exact.
+ */
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ if (!dfp->offset) {
+ if (dfp->length)
+ return __this_address;
+ seenzero = true;
+ continue;
+ }
+ if (seenzero)
+ return __this_address;
+ if (be16_to_cpu(dfp->offset) == off) {
+ matched = true;
+ if (dfp->length != dup->length)
+ return __this_address;
+ } else if (be16_to_cpu(dfp->offset) > off) {
+ if (off + be16_to_cpu(dup->length) >
+ be16_to_cpu(dfp->offset))
+ return __this_address;
+ } else {
+ if (be16_to_cpu(dfp->offset) +
+ be16_to_cpu(dfp->length) > off)
+ return __this_address;
+ }
+ if (!matched &&
+ be16_to_cpu(dfp->length) < be16_to_cpu(dup->length))
+ return __this_address;
+ if (dfp > &bf[0] &&
+ be16_to_cpu(dfp[-1].length) < be16_to_cpu(dfp[0].length))
+ return __this_address;
+ }
+
+ /* Looks ok so far; now try to match up with a bestfree entry. */
+ *bf_ent = xfs_dir2_data_freefind(hdr, bf, dup);
+ return NULL;
+}
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup) /* unused space */
+{
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ xfs_dir2_data_aoff_t off; /* offset value needed */
+
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+ /*
+ * If this is smaller than the smallest bestfree entry,
+ * it can't be there since they're sorted.
+ */
+ if (be16_to_cpu(dup->length) <
+ be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return NULL;
+ /*
+ * Look at the three bestfree entries for our guy.
+ */
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ if (!dfp->offset)
+ return NULL;
+ if (be16_to_cpu(dfp->offset) == off)
+ return dfp;
+ }
+ /*
+ * Didn't find it. This only happens if there are duplicate lengths.
+ */
+ return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t * /* entry inserted */
+xfs_dir2_data_freeinsert(
+ struct xfs_dir2_data_hdr *hdr, /* data block pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup, /* unused space */
+ int *loghead) /* log the data header (out) */
+{
+ xfs_dir2_data_free_t new; /* new bestfree entry */
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ new.length = dup->length;
+ new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+ /*
+ * Insert at position 0, 1, or 2; or not at all.
+ */
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = dfp[0];
+ dfp[0] = new;
+ *loghead = 1;
+ return &dfp[0];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = new;
+ *loghead = 1;
+ return &dfp[1];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+ dfp[2] = new;
+ *loghead = 1;
+ return &dfp[2];
+ }
+ return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
+ int *loghead) /* out: log data header */
+{
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * It's the first entry, slide the next 2 up.
+ */
+ if (dfp == &bf[0]) {
+ bf[0] = bf[1];
+ bf[1] = bf[2];
+ }
+ /*
+ * It's the second entry, slide the 3rd entry up.
+ */
+ else if (dfp == &bf[1])
+ bf[1] = bf[2];
+ /*
+ * Must be the last entry.
+ */
+ else
+ ASSERT(dfp == &bf[2]);
+ /*
+ * Clear the 3rd entry, must be zero now.
+ */
+ bf[2].length = 0;
+ bf[2].offset = 0;
+ *loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
+{
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ void *addr = hdr;
+ unsigned int offset = geo->data_entry_offset;
+ unsigned int end;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * Start by clearing the table.
+ */
+ memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+ *loghead = 1;
+
+ end = xfs_dir3_data_end_offset(geo, addr);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = addr + offset;
+ struct xfs_dir2_data_entry *dep = addr + offset;
+
+ /*
+ * If it's a free entry, insert it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /*
+ * For active entries, check their tags and skip them.
+ */
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int /* error */
+xfs_dir3_data_init(
+ struct xfs_da_args *args, /* directory operation args */
+ xfs_dir2_db_t blkno, /* logical dir block number */
+ struct xfs_buf **bpp) /* output block buffer */
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *bf;
+ int error;
+ int i;
+
+ /*
+ * Get the buffer set up for the block.
+ */
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+ /*
+ * Initialize the header.
+ */
+ hdr = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
+
+ } else
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ bf[0].offset = cpu_to_be16(geo->data_entry_offset);
+ bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset);
+ for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+ bf[i].length = 0;
+ bf[i].offset = 0;
+ }
+
+ /*
+ * Set up an unused entry for the block's body.
+ */
+ dup = bp->b_addr + geo->data_entry_offset;
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = bf[0].length;
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+
+ /*
+ * Log it and return it.
+ */
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_entry_t *dep) /* data entry pointer */
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) -
+ (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+#ifdef DEBUG
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_unused_t *dup) /* data unused pointer */
+{
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ /*
+ * Log the first part of the unused entry.
+ */
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+ (uint)((char *)&dup->length + sizeof(dup->length) -
+ 1 - (char *)hdr));
+ /*
+ * Log the end (tag) of the unused entry.
+ */
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+ sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_aoff_t offset, /* starting byte offset */
+ xfs_dir2_data_aoff_t len, /* length in bytes */
+ int *needlogp, /* out: log header */
+ int *needscanp) /* out: regen bestfree */
+{
+ xfs_dir2_data_hdr_t *hdr; /* data block pointer */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ int needscan; /* need to regen bestfree */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *postdup; /* unused entry after us */
+ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ unsigned int end;
+ struct xfs_dir2_data_free *bf;
+
+ hdr = bp->b_addr;
+
+ /*
+ * Figure out where the end of the data area is.
+ */
+ end = xfs_dir3_data_end_offset(args->geo, hdr);
+ ASSERT(end != 0);
+
+ /*
+ * If this isn't the start of the block, then back up to
+ * the previous entry and see if it's free.
+ */
+ if (offset > args->geo->data_entry_offset) {
+ __be16 *tagp; /* tag just before us */
+
+ tagp = (__be16 *)((char *)hdr + offset) - 1;
+ prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+ if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ prevdup = NULL;
+ } else
+ prevdup = NULL;
+ /*
+ * If this isn't the end of the block, see if the entry after
+ * us is free.
+ */
+ if (offset + len < end) {
+ postdup =
+ (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ postdup = NULL;
+ } else
+ postdup = NULL;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * Previous and following entries are both free,
+ * merge everything into a single free entry.
+ */
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
+ if (prevdup && postdup) {
+ xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
+
+ /*
+ * See if prevdup and/or postdup are in bestfree table.
+ */
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+ /*
+ * We need a rescan unless there are exactly 2 free entries
+ * namely our two. Then we know what's happening, otherwise
+ * since the third bestfree is there, there might be more
+ * entries.
+ */
+ needscan = (bf[2].length != 0);
+ /*
+ * Fix up the new big freespace.
+ */
+ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
+ if (!needscan) {
+ /*
+ * Has to be the case that entries 0 and 1 are
+ * dfp and dfp2 (don't know which is which), and
+ * entry 2 is empty.
+ * Remove entry 1 first then entry 0.
+ */
+ ASSERT(dfp && dfp2);
+ if (dfp == &bf[1]) {
+ dfp = &bf[0];
+ ASSERT(dfp2 == dfp);
+ dfp2 = &bf[1];
+ }
+ xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ /*
+ * Now insert the new entry.
+ */
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+ needlogp);
+ ASSERT(dfp == &bf[0]);
+ ASSERT(dfp->length == prevdup->length);
+ ASSERT(!dfp[1].length);
+ ASSERT(!dfp[2].length);
+ }
+ }
+ /*
+ * The entry before us is free, merge with it.
+ */
+ else if (prevdup) {
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ be16_add_cpu(&prevdup->length, len);
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
+ /*
+ * If the previous entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(prevdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
+ }
+ /*
+ * The following entry is free, merge with it.
+ */
+ else if (postdup) {
+ dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If the following entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(newdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
+ }
+ /*
+ * Neither neighbor is free. Make a new entry.
+ */
+ else {
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+ }
+ *needscanp = needscan;
+}
+
+/* Check our free data for obvious signs of corruption. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len)
+{
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) &&
+ hdr->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return __this_address;
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ return __this_address;
+ if (offset < (char *)dup - (char *)hdr)
+ return __this_address;
+ if (offset + len > (char *)dup + be16_to_cpu(dup->length) - (char *)hdr)
+ return __this_address;
+ if ((char *)dup - (char *)hdr !=
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)))
+ return __this_address;
+ return NULL;
+}
+
+/* Sanity-check a new bestfree entry. */
+static inline xfs_failaddr_t
+xfs_dir2_data_check_new_free(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *dfp,
+ struct xfs_dir2_data_unused *newdup)
+{
+ if (dfp == NULL)
+ return __this_address;
+ if (dfp->length != newdup->length)
+ return __this_address;
+ if (be16_to_cpu(dfp->offset) != (char *)newdup - (char *)hdr)
+ return __this_address;
+ return NULL;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+int
+xfs_dir2_data_use_free(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ xfs_dir2_data_unused_t *dup, /* unused entry */
+ xfs_dir2_data_aoff_t offset, /* starting offset to use */
+ xfs_dir2_data_aoff_t len, /* length to use */
+ int *needlogp, /* out: need to log header */
+ int *needscanp) /* out: need regen bestfree */
+{
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
+ struct xfs_dir2_data_free *bf;
+ xfs_failaddr_t fa;
+ int matchback; /* matches end of freespace */
+ int matchfront; /* matches start of freespace */
+ int needscan; /* need to regen bestfree */
+ int oldlen; /* old unused entry's length */
+
+ hdr = bp->b_addr;
+ fa = xfs_dir2_data_check_free(hdr, dup, offset, len);
+ if (fa)
+ goto corrupt;
+ /*
+ * Look up the entry in the bestfree table.
+ */
+ oldlen = be16_to_cpu(dup->length);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+ ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+ /*
+ * Check for alignment with front and back of the entry.
+ */
+ matchfront = (char *)dup - (char *)hdr == offset;
+ matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * If we matched it exactly we just need to get rid of it from
+ * the bestfree table.
+ */
+ if (matchfront && matchback) {
+ if (dfp) {
+ needscan = (bf[2].offset != 0);
+ if (!needscan)
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ }
+ }
+ /*
+ * We match the first part of the entry.
+ * Make a new entry with the remaining freespace.
+ */
+ else if (matchfront) {
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(oldlen - len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
+ /*
+ * If we got inserted at the last slot,
+ * that means we don't know if there was a better
+ * choice for the last slot, or not. Rescan.
+ */
+ needscan = dfp == &bf[2];
+ }
+ }
+ /*
+ * We match the last part of the entry.
+ * Trim the allocated space off the tail of the entry.
+ */
+ else if (matchback) {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup);
+ if (fa)
+ goto corrupt;
+ /*
+ * If we got inserted at the last slot,
+ * that means we don't know if there was a better
+ * choice for the last slot, or not. Rescan.
+ */
+ needscan = dfp == &bf[2];
+ }
+ }
+ /*
+ * Poking out the middle of an entry.
+ * Make two new entries.
+ */
+ else {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup2) =
+ cpu_to_be16((char *)newdup2 - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
+ /*
+ * If the old entry was in the table, we need to scan
+ * if the 3rd entry was valid, since these entries
+ * are smaller than the old one.
+ * If we don't need to scan that means there were 1 or 2
+ * entries in the table, and removing the old and adding
+ * the 2 new will work.
+ */
+ if (dfp) {
+ needscan = (bf[2].length != 0);
+ if (!needscan) {
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+ needlogp);
+ }
+ }
+ }
+ *needscanp = needscan;
+ return 0;
+corrupt:
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
+ hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+ return -EFSCORRUPTED;
+}
+
+/* Find the end of the entry data in a data/block format dir block. */
+unsigned int
+xfs_dir3_data_end_offset(
+ struct xfs_da_geometry *geo,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ void *p;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr));
+ return p - (void *)hdr;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ return geo->blksize;
+ default:
+ return 0;
+ }
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000..cb9e950a9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+ int *indexp, struct xfs_buf **dbpp,
+ struct xfs_dir3_icleaf_hdr *leafhdr);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+
+void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.count);
+ to->stale = be16_to_cpu(from3->hdr.stale);
+ to->ents = from3->__ents;
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+ to->ents = from->__ents;
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+ }
+}
+
+void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.count = cpu_to_be16(from->count);
+ to3->hdr.stale = cpu_to_be16(from->stale);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+ }
+}
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+static xfs_failaddr_t
+xfs_dir3_leaf1_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+ return __this_address;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
+}
+
+static inline void
+xfs_dir3_leaf_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leaf1_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
+xfs_failaddr_t
+xfs_dir3_leaf_check_int(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf,
+ bool expensive_checking)
+{
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
+ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC);
+
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+ /*
+ * XXX (dgc): This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from i_disk_size.
+ */
+ if (hdr->count > geo->leaf_max_ents)
+ return __this_address;
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if (isleaf1 &&
+ (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ return __this_address;
+
+ if (!expensive_checking)
+ return NULL;
+
+ /* Check hash value order, count stale entries. */
+ for (i = stale = 0; i < hdr->count; i++) {
+ if (i + 1 < hdr->count) {
+ if (be32_to_cpu(hdr->ents[i].hashval) >
+ be32_to_cpu(hdr->ents[i + 1].hashval))
+ return __this_address;
+ }
+ if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+ be32_to_cpu(hdr->ents[i].address)) >=
+ be32_to_cpu(ltp->bestcount))
+ return __this_address;
+ }
+ if (hdr->stale != stale)
+ return __this_address;
+ return NULL;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static xfs_failaddr_t
+xfs_dir3_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
+
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr);
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true);
+}
+
+static void
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_leaf_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leaf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
+};
+
+int
+xfs_dir3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leaf1_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
+}
+
+int
+xfs_dir3_leafn_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leafn_buf_ops);
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ uint16_t type)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ memset(leaf3, 0, sizeof(*leaf3));
+
+ leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+ ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+ : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ leaf3->info.owner = cpu_to_be64(owner);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
+ } else {
+ memset(leaf, 0, sizeof(*leaf));
+ leaf->hdr.info.magic = cpu_to_be16(type);
+ }
+
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * Caller is responsible for initialising the bests table.
+ */
+ if (type == XFS_DIR2_LEAF1_MAGIC) {
+ struct xfs_dir2_leaf_tail *ltp;
+
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+ ltp->bestcount = 0;
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+ } else {
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t bno,
+ struct xfs_buf **bpp,
+ uint16_t magic)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (magic == XFS_DIR2_LEAF1_MAGIC)
+ xfs_dir3_leaf_log_tail(args, bp);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_block_to_leaf(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *dbp) /* input block's buffer */
+{
+ __be16 *bestsp; /* leaf's bestsp entries */
+ xfs_dablk_t blkno; /* leaf block's bno */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
+ xfs_dir2_block_tail_t *btp; /* block's tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ struct xfs_buf *lbp; /* leaf block's buffer */
+ xfs_dir2_db_t ldb; /* leaf block's bno */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
+ int needlog; /* need to log block header */
+ int needscan; /* need to rescan bestfree */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_block_to_leaf(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Add the leaf block to the inode.
+ * This interface will only put blocks in the leaf/node range.
+ * Since that's empty now, we'll get the root (block 0 in range).
+ */
+ if ((error = xfs_da_grow_inode(args, &blkno))) {
+ return error;
+ }
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+ /*
+ * Initialize the leaf block, get a buffer for it.
+ */
+ error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+ if (error)
+ return error;
+
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * Set the counts in the leaf header.
+ */
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ leafhdr.count = be32_to_cpu(btp->count);
+ leafhdr.stale = be32_to_cpu(btp->stale);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
+ /*
+ * Could compact these but I think we always do the conversion
+ * after squeezing out stale entries.
+ */
+ memcpy(leafhdr.ents, blp,
+ be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry));
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1);
+ needscan = 0;
+ needlog = 1;
+ /*
+ * Make the space formerly occupied by the leaf entries and block
+ * tail be free.
+ */
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+ (char *)blp),
+ &needlog, &needscan);
+ /*
+ * Fix up the block header, make it a data block.
+ */
+ dbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ else
+ hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ /*
+ * Set up leaf tail and bests table.
+ */
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(1);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ bestsp[0] = bf[0].length;
+ /*
+ * Log the data header and leaf bests table.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+ return 0;
+}
+
+STATIC void
+xfs_dir3_leaf_find_stale(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index,
+ int *lowstale,
+ int *highstale)
+{
+ /*
+ * Find the first stale entry before our index, if any.
+ */
+ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+ if (ents[*lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ }
+
+ /*
+ * Find the first stale entry at or after our index, if any.
+ * Stop if the result would require moving more entries than using
+ * lowstale.
+ */
+ for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+ if (ents[*highstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+ break;
+ }
+}
+
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index, /* leaf table position */
+ int compact, /* need to compact leaves */
+ int lowstale, /* index of prev stale leaf */
+ int highstale, /* index of next stale leaf */
+ int *lfloglow, /* low leaf logging index */
+ int *lfloghigh) /* high leaf logging index */
+{
+ if (!leafhdr->stale) {
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
+
+ /*
+ * Now we need to make room to insert the leaf entry.
+ *
+ * If there are no stale entries, just insert a hole at index.
+ */
+ lep = &ents[index];
+ if (index < leafhdr->count)
+ memmove(lep + 1, lep,
+ (leafhdr->count - index) * sizeof(*lep));
+
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ *lfloglow = index;
+ *lfloghigh = leafhdr->count++;
+ return lep;
+ }
+
+ /*
+ * There are stale entries.
+ *
+ * We will use one of them for the new entry. It's probably not at
+ * the right location, so we'll have to shift some up or down first.
+ *
+ * If we didn't compact before, we need to find the nearest stale
+ * entries before and after our insertion point.
+ */
+ if (compact == 0)
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+ &lowstale, &highstale);
+
+ /*
+ * If the low one is better, use it.
+ */
+ if (lowstale >= 0 &&
+ (highstale == leafhdr->count ||
+ index - lowstale - 1 < highstale - index)) {
+ ASSERT(index - lowstale - 1 >= 0);
+ ASSERT(ents[lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries up to cover the stale entry and make room
+ * for the new entry.
+ */
+ if (index - lowstale - 1 > 0) {
+ memmove(&ents[lowstale], &ents[lowstale + 1],
+ (index - lowstale - 1) *
+ sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = min(lowstale, *lfloglow);
+ *lfloghigh = max(index - 1, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index - 1];
+ }
+
+ /*
+ * The high one is better, so use that one.
+ */
+ ASSERT(highstale - index >= 0);
+ ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries down to cover the stale entry and make room for the
+ * new entry.
+ */
+ if (highstale - index > 0) {
+ memmove(&ents[index + 1], &ents[index],
+ (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = min(index, *lfloglow);
+ *lfloghigh = max(highstale, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_leaf_addname(
+ struct xfs_da_args *args) /* operation arguments */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
+ __be16 *bestsp; /* freespace table in leaf */
+ __be16 *tagp; /* end of data entry */
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
+ int error; /* error return value */
+ int grown; /* allocated new data block */
+ int highstale = 0; /* index of next stale leaf */
+ int i; /* temporary, index */
+ int index; /* leaf table position */
+ int length; /* length of new entry */
+ int lfloglow; /* low leaf logging index */
+ int lfloghigh; /* high leaf logging index */
+ int lowstale = 0; /* index of prev stale leaf */
+ int needbytes; /* leaf block bytes needed */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data free */
+ xfs_dir2_db_t use_block; /* data block number */
+
+ trace_xfs_dir2_leaf_addname(args);
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ if (error)
+ return error;
+
+ /*
+ * Look up the entry by hash value and name.
+ * We know it's not there, our caller has already done a lookup.
+ * So the index is of the entry to insert in front of.
+ * But if there are dup hash values the index is of the first of those.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+
+ /*
+ * See if there are any entries with the same hash value
+ * and space in their block for the new entry.
+ * This is good because it puts multiple same-hash value entries
+ * in a data block, improving the lookup of those entries.
+ */
+ for (use_block = -1, lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ index++, lep++) {
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ ASSERT(i < be32_to_cpu(ltp->bestcount));
+ ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+ if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ /*
+ * Didn't find a block yet, linear search all the data blocks.
+ */
+ if (use_block == -1) {
+ for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+ /*
+ * Remember a block we see that's missing.
+ */
+ if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+ use_block == -1)
+ use_block = i;
+ else if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ }
+ /*
+ * How many bytes do we need in the leaf block?
+ */
+ needbytes = 0;
+ if (!leafhdr.stale)
+ needbytes += sizeof(xfs_dir2_leaf_entry_t);
+ if (use_block == -1)
+ needbytes += sizeof(xfs_dir2_data_off_t);
+
+ /*
+ * Now kill use_block if it refers to a missing block, so we
+ * can use it as an indication of allocation needed.
+ */
+ if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+ use_block = -1;
+ /*
+ * If we don't have enough free bytes but we can make enough
+ * by compacting out stale entries, we'll do that.
+ */
+ if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+ leafhdr.stale > 1)
+ compact = 1;
+
+ /*
+ * Otherwise if we don't have enough free bytes we need to
+ * convert to node form.
+ */
+ else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+ /*
+ * Just checking or no space reservation, give up.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
+ xfs_trans_brelse(tp, lbp);
+ return -ENOSPC;
+ }
+ /*
+ * Convert to node form.
+ */
+ error = xfs_dir2_leaf_to_node(args, lbp);
+ if (error)
+ return error;
+ /*
+ * Then add the new entry.
+ */
+ return xfs_dir2_node_addname(args);
+ }
+ /*
+ * Otherwise it will fit without compaction.
+ */
+ else
+ compact = 0;
+ /*
+ * If just checking, then it will fit unless we needed to allocate
+ * a new data block.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, lbp);
+ return use_block == -1 ? -ENOSPC : 0;
+ }
+ /*
+ * If no allocations are allowed, return now before we've
+ * changed anything.
+ */
+ if (args->total == 0 && use_block == -1) {
+ xfs_trans_brelse(tp, lbp);
+ return -ENOSPC;
+ }
+ /*
+ * Need to compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and we'll shift that one to our insertion
+ * point later.
+ */
+ if (compact) {
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ }
+ /*
+ * There are stale entries, so we'll need log-low and log-high
+ * impossibly bad values later.
+ */
+ else if (leafhdr.stale) {
+ lfloglow = leafhdr.count;
+ lfloghigh = -1;
+ }
+ /*
+ * If there was no data block space found, we need to allocate
+ * a new one.
+ */
+ if (use_block == -1) {
+ /*
+ * Add the new data block.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+ &use_block))) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * Initialize the block.
+ */
+ if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * If we're adding a new data block on the end we need to
+ * extend the bests table. Copy it up one entry.
+ */
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
+ bestsp--;
+ memmove(&bestsp[0], &bestsp[1],
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
+ }
+ /*
+ * If we're filling in a previously empty block just log it.
+ */
+ else
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ bestsp[use_block] = bf[0].length;
+ grown = 1;
+ } else {
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, use_block),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ grown = 0;
+ }
+ /*
+ * Point to the biggest freespace in our data block.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ needscan = needlog = 0;
+ /*
+ * Mark the initial part of our freespace in use for the new entry.
+ */
+ error = xfs_dir2_data_use_free(args, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ length, &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * Initialize our new entry (at last).
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ /*
+ * Need to scan fix up the bestfree table.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ /*
+ * Need to log the data block's header.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ /*
+ * If the bests table needs to be changed, do it.
+ * Log the change unless we've already done that.
+ */
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+ bestsp[use_block] = bf[0].length;
+ if (!grown)
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+ }
+
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
+ /*
+ * Fill in the new leaf entry.
+ */
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+ be16_to_cpu(*tagp)));
+ /*
+ * Log the leaf fields and give up the buffers.
+ */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ return 0;
+}
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir3_leaf_compact(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_buf *bp) /* leaf buffer */
+{
+ int from; /* source leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int loglow; /* first leaf entry to log */
+ int to; /* target leaf index */
+ struct xfs_inode *dp = args->dp;
+
+ leaf = bp->b_addr;
+ if (!leafhdr->stale)
+ return;
+
+ /*
+ * Compress out the stale entries in place.
+ */
+ for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+ if (leafhdr->ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ continue;
+ /*
+ * Only actually copy the entries that are different.
+ */
+ if (from > to) {
+ if (loglow == -1)
+ loglow = to;
+ leafhdr->ents[to] = leafhdr->ents[from];
+ }
+ to++;
+ }
+ /*
+ * Update and log the header, log the leaf entries.
+ */
+ ASSERT(leafhdr->stale == from - to);
+ leafhdr->count -= leafhdr->stale;
+ leafhdr->stale = 0;
+
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (loglow != -1)
+ xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir3_leaf_compact_x1(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int *indexp, /* insertion index */
+ int *lowstalep, /* out: stale entry before us */
+ int *highstalep, /* out: stale entry after us */
+ int *lowlogp, /* out: low log index */
+ int *highlogp) /* out: high log index */
+{
+ int from; /* source copy index */
+ int highstale; /* stale entry at/after index */
+ int index; /* insertion index */
+ int keepstale; /* source index of kept stale */
+ int lowstale; /* stale entry before index */
+ int newindex=0; /* new insertion index */
+ int to; /* destination copy index */
+
+ ASSERT(leafhdr->stale > 1);
+ index = *indexp;
+
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
+ /*
+ * Pick the better of lowstale and highstale.
+ */
+ if (lowstale >= 0 &&
+ (highstale == leafhdr->count ||
+ index - lowstale <= highstale - index))
+ keepstale = lowstale;
+ else
+ keepstale = highstale;
+ /*
+ * Copy the entries in place, removing all the stale entries
+ * except keepstale.
+ */
+ for (from = to = 0; from < leafhdr->count; from++) {
+ /*
+ * Notice the new value of index.
+ */
+ if (index == from)
+ newindex = to;
+ if (from != keepstale &&
+ ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (from == to)
+ *lowlogp = to;
+ continue;
+ }
+ /*
+ * Record the new keepstale value for the insertion.
+ */
+ if (from == keepstale)
+ lowstale = highstale = to;
+ /*
+ * Copy only the entries that have moved.
+ */
+ if (from > to)
+ ents[to] = ents[from];
+ to++;
+ }
+ ASSERT(from > to);
+ /*
+ * If the insertion point was past the last entry,
+ * set the new insertion point accordingly.
+ */
+ if (index == from)
+ newindex = to;
+ *indexp = newindex;
+ /*
+ * Adjust the leaf header values.
+ */
+ leafhdr->count -= from - to;
+ leafhdr->stale = 1;
+ /*
+ * Remember the low/high stale value only in the "right"
+ * direction.
+ */
+ if (lowstale >= newindex)
+ lowstale = -1;
+ else
+ highstale = leafhdr->count;
+ *highlogp = leafhdr->count - 1;
+ *lowstalep = lowstale;
+ *highstalep = highstale;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir3_leaf_log_bests(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp, /* leaf buffer */
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ __be16 *firstb; /* pointer to first entry */
+ __be16 *lastb; /* pointer to last entry */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+ lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
+ (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_ents(
+ struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
+ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ firstlep = &hdr->ents[first];
+ lastlep = &hdr->ents[last];
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
+ (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->geo->leaf_hdr_size - 1);
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* found entry index */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_lookup(args);
+
+ /*
+ * Look up name in the leaf block, returning both buffers and index.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ tp = args->trans;
+ dp = args->dp;
+ xfs_dir3_leaf_check(dp, lbp);
+
+ /*
+ * Get to the leaf entry and contained data entry address.
+ */
+ lep = &leafhdr.ents[index];
+
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ /*
+ * Return the found inode number & CI name if appropriate
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
+ return error;
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int /* error */
+xfs_dir2_leaf_lookup_int(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf **lbpp, /* out: leaf buffer */
+ int *indexp, /* out: index in leaf block */
+ struct xfs_buf **dbpp, /* out: data buffer */
+ struct xfs_dir3_icleaf_hdr *leafhdr)
+{
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ struct xfs_buf *dbp = NULL; /* data buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index in leaf block */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t cidb = -1; /* case match data block no. */
+ enum xfs_dacmp cmp; /* name compare result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ if (error)
+ return error;
+
+ *lbpp = lbp;
+ leaf = lbp->b_addr;
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf);
+
+ /*
+ * Look for the first leaf entry with our hash value.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ /*
+ * Loop over all the entries with the right hash value
+ * looking to match the name.
+ */
+ for (lep = &leafhdr->ents[index];
+ index < leafhdr->count &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip over stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get the new data block number.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * If it's not the same as the old data block number,
+ * need to pitch the old one and read the new one.
+ */
+ if (newdb != curdb) {
+ if (dbp)
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, newdb),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *indexp = index;
+ /* case exact match: return the current buffer. */
+ if (cmp == XFS_CMP_EXACT) {
+ *dbpp = dbp;
+ return 0;
+ }
+ cidb = curdb;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or remove).
+ * If a case-insensitive match was found earlier, re-read the
+ * appropriate data block if required and return it.
+ */
+ if (args->cmpresult == XFS_CMP_CASE) {
+ ASSERT(cidb != -1);
+ if (cidb != curdb) {
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, cidb),
+ 0, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
+ }
+ *dbpp = dbp;
+ return 0;
+ }
+ /*
+ * No match found, return -ENOENT.
+ */
+ ASSERT(cidb == -1);
+ if (dbp)
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
+ return -ENOENT;
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int /* error */
+xfs_dir2_leaf_removename(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ __be16 *bestsp; /* leaf block best freespace */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_db_t db; /* data block number */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry structure */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_db_t i; /* temporary data block # */
+ int index; /* index into leaf entries */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_dir2_data_off_t oldbest; /* old value of best free */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_removename(args);
+
+ /*
+ * Lookup the leaf entry, get the leaf and data blocks read in.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ /*
+ * Point to the leaf entry, use that to point to the data entry.
+ */
+ lep = &leafhdr.ents[index];
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)));
+ needscan = needlog = 0;
+ oldbest = be16_to_cpu(bf[0].length);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[db]) != oldbest) {
+ xfs_buf_mark_corrupt(lbp);
+ return -EFSCORRUPTED;
+ }
+ /*
+ * Mark the former data entry unused.
+ */
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * We just mark the leaf entry stale by putting a null in it.
+ */
+ leafhdr.stale++;
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index);
+
+ /*
+ * Scan the freespace in the data block again if necessary,
+ * log the data block header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ /*
+ * If the longest freespace in the data block has changed,
+ * put the new value in the bests table and log that.
+ */
+ if (be16_to_cpu(bf[0].length) != oldbest) {
+ bestsp[db] = bf[0].length;
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
+ }
+ xfs_dir3_data_check(dp, dbp);
+ /*
+ * If the data block is now empty then get rid of the data block.
+ */
+ if (be16_to_cpu(bf[0].length) ==
+ geo->blksize - geo->data_entry_offset) {
+ ASSERT(db != geo->datablk);
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ /*
+ * Nope, can't get rid of it because it caused
+ * allocation of a bmap btree block to do so.
+ * Just go on, returning success, leaving the
+ * empty block in place.
+ */
+ if (error == -ENOSPC && args->total == 0)
+ error = 0;
+ xfs_dir3_leaf_check(dp, lbp);
+ return error;
+ }
+ dbp = NULL;
+ /*
+ * If this is the last data block then compact the
+ * bests table by getting rid of entries.
+ */
+ if (db == be32_to_cpu(ltp->bestcount) - 1) {
+ /*
+ * Look for the last active entry (i).
+ */
+ for (i = db - 1; i > 0; i--) {
+ if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ /*
+ * Copy the table down so inactive entries at the
+ * end are removed.
+ */
+ memmove(&bestsp[db - i], bestsp,
+ (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+ be32_add_cpu(&ltp->bestcount, -(db - i));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
+ } else
+ bestsp[db] = cpu_to_be16(NULLDATAOFF);
+ }
+ /*
+ * If the data block was not the first one, drop it.
+ */
+ else if (db != geo->datablk)
+ dbp = NULL;
+
+ xfs_dir3_leaf_check(dp, lbp);
+ /*
+ * See if we can convert to block form.
+ */
+ return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int /* error */
+xfs_dir2_leaf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index of leaf entry */
+ struct xfs_buf *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_replace(args);
+
+ /*
+ * Look up the entry.
+ */
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
+ return error;
+
+ dp = args->dp;
+ /*
+ * Point to the leaf entry, get data address from it.
+ */
+ lep = &leafhdr.ents[index];
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+ /*
+ * Put the new inode number in, log it.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tp = args->trans;
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_trans_brelse(tp, lbp);
+ return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int /* index value */
+xfs_dir2_leaf_search_hash(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp) /* leaf buffer */
+{
+ xfs_dahash_t hash=0; /* hash from this entry */
+ xfs_dahash_t hashwant; /* hash value looking for */
+ int high; /* high leaf index */
+ int low; /* low leaf index */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int mid=0; /* current leaf index */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr);
+
+ /*
+ * Note, the table cannot be empty, so we have to go through the loop.
+ * Binary search the leaf entries looking for our hash value.
+ */
+ for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1,
+ hashwant = args->hashval;
+ low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+ break;
+ if (hash < hashwant)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+ /*
+ * Found one, back up through all the equal hash values.
+ */
+ if (hash == hashwant) {
+ while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+ mid--;
+ }
+ }
+ /*
+ * Need to point to an entry higher than ours.
+ */
+ else if (hash < hashwant)
+ mid++;
+ return mid;
+}
+
+/*
+ * Trim off a trailing data block. We know it's empty since the leaf
+ * freespace table says so.
+ */
+int /* error */
+xfs_dir2_leaf_trim_data(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp, /* leaf buffer */
+ xfs_dir2_db_t db) /* data block number */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ __be16 *bestsp; /* leaf bests table */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Read the offending data block. We need its buffer.
+ */
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
+ if (error)
+ return error;
+
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+#ifdef DEBUG
+{
+ struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+ struct xfs_dir2_data_free *bf =
+ xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ ASSERT(be16_to_cpu(bf[0].length) ==
+ geo->blksize - geo->data_entry_offset);
+ ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+
+ /*
+ * Get rid of the data block.
+ */
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ ASSERT(error != -ENOSPC);
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
+ /*
+ * Eliminate the last bests entry from the table.
+ */
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ be32_add_cpu(&ltp->bestcount, -1);
+ memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ return 0;
+}
+
+static inline size_t
+xfs_dir3_leaf_size(
+ struct xfs_dir3_icleaf_hdr *hdr,
+ int counts)
+{
+ int entries;
+ int hdrsize;
+
+ entries = hdr->count - hdr->stale;
+ if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+ hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+ else
+ hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+ return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+ + counts * sizeof(xfs_dir2_data_off_t)
+ + sizeof(xfs_dir2_leaf_tail_t);
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int /* error */
+xfs_dir2_node_to_leaf(
+ xfs_da_state_t *state) /* directory operation state */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ struct xfs_buf *fbp; /* buffer for freespace block */
+ xfs_fileoff_t fo; /* freespace file offset */
+ struct xfs_buf *lbp; /* buffer for leaf block */
+ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int rval; /* successful free trim? */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ /*
+ * There's more than a leaf level in the btree, so there must
+ * be multiple leafn blocks. Give up.
+ */
+ if (state->path.active > 1)
+ return 0;
+ args = state->args;
+
+ trace_xfs_dir2_node_to_leaf(args);
+
+ mp = state->mp;
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Get the last offset in the file.
+ */
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ fo -= args->geo->fsbcount;
+ /*
+ * If there are freespace blocks other than the first one,
+ * take this opportunity to remove trailing empty freespace blocks
+ * that may have been left behind during no-space-reservation
+ * operations.
+ */
+ while (fo > args->geo->freeblk) {
+ if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+ return error;
+ }
+ if (rval)
+ fo -= args->geo->fsbcount;
+ else
+ return 0;
+ }
+ /*
+ * Now find the block just before the freespace block.
+ */
+ if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ /*
+ * If it's not the single leaf block, give up.
+ */
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+ return 0;
+ lbp = state->path.blk[0].bp;
+ leaf = lbp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+ /*
+ * Read the freespace block.
+ */
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
+ if (error)
+ return error;
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
+
+ ASSERT(!freehdr.firstdb);
+
+ /*
+ * Now see if the leafn and free data will fit in a leaf1.
+ * If not, release the buffer and give up.
+ */
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+ xfs_trans_brelse(tp, fbp);
+ return 0;
+ }
+
+ /*
+ * If the leaf has any stale entries in it, compress them out.
+ */
+ if (leafhdr.stale)
+ xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
+ lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+ leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+ ? XFS_DIR2_LEAF1_MAGIC
+ : XFS_DIR3_LEAF1_MAGIC;
+
+ /*
+ * Set up the leaf tail from the freespace block.
+ */
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
+ /*
+ * Set up the leaf bests table.
+ */
+ memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests,
+ freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+ xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+
+ /*
+ * Get rid of the freespace block.
+ */
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
+ if (error) {
+ /*
+ * This can't fail here because it can only happen when
+ * punching out the middle of an extent, and this is an
+ * isolated block.
+ */
+ ASSERT(error != -ENOSPC);
+ return error;
+ }
+ fbp = NULL;
+ /*
+ * Now see if we can convert the single-leaf directory
+ * down to a block form directory.
+ * This routine always kills the dabuf for the leaf, so
+ * eliminate it from the path.
+ */
+ error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+ state->path.blk[0].bp = NULL;
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000..7a03aeb9f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+ int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+ int index, xfs_da_state_blk_t *dblk,
+ int *rval);
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / geo->free_max_bests);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % geo->free_max_bests;
+}
+
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+static xfs_failaddr_t
+xfs_dir3_leafn_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+ return __this_address;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
+}
+
+static inline void
+xfs_dir3_leaf_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_leafn_check(dp, bp);
+ if (!fa)
+ return;
+ xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount,
+ bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__,
+ fa);
+ ASSERT(0);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static xfs_failaddr_t
+xfs_dir3_free_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+ return NULL;
+}
+
+static void
+xfs_dir3_free_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_dir3_free_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_dir3_free_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_dir3_free_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
+ .verify_read = xfs_dir3_free_read_verify,
+ .verify_write = xfs_dir3_free_write_verify,
+ .verify_struct = xfs_dir3_free_verify,
+};
+
+/* Everything ok in the free block header? */
+static xfs_failaddr_t
+xfs_dir3_free_header_check(
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int maxbests = mp->m_dir_geo->free_max_bests;
+ unsigned int firstdb;
+
+ firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
+ xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
+ maxbests;
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ if (be32_to_cpu(hdr3->firstdb) != firstdb)
+ return __this_address;
+ if (be32_to_cpu(hdr3->nvalid) > maxbests)
+ return __this_address;
+ if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
+ return __this_address;
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ } else {
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+ if (be32_to_cpu(hdr->firstdb) != firstdb)
+ return __this_address;
+ if (be32_to_cpu(hdr->nvalid) > maxbests)
+ return __this_address;
+ if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
+ return __this_address;
+ }
+ return NULL;
+}
+
+static int
+__xfs_dir3_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ unsigned int flags,
+ struct xfs_buf **bpp)
+{
+ xfs_failaddr_t fa;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_free_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ /* try read returns without an error or *bpp if it lands in a hole */
+ if (tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+
+ return 0;
+}
+
+void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
+
+ to->magic = be32_to_cpu(from3->hdr.hdr.magic);
+ to->firstdb = be32_to_cpu(from3->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from3->hdr.nvalid);
+ to->nused = be32_to_cpu(from3->hdr.nused);
+ to->bests = from3->bests;
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+ } else {
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ to->bests = from->bests;
+
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+ }
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ to3->hdr.hdr.magic = cpu_to_be32(from->magic);
+ to3->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to3->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to3->hdr.nused = cpu_to_be32(from->nused);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+ }
+}
+
+int
+xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t fbno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_dir3_icfree_hdr hdr;
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+ hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
+ xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+ struct xfs_da_args *args,
+ struct xfs_dir3_icfree_hdr *hdr,
+ struct xfs_buf *bp,
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ struct xfs_dir2_free *free = bp->b_addr;
+
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ xfs_trans_log_buf(args->trans, bp,
+ (char *)&hdr->bests[first] - (char *)free,
+ (char *)&hdr->bests[last] - (char *)free +
+ sizeof(hdr->bests[0]) - 1);
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+#ifdef DEBUG
+ xfs_dir2_free_t *free; /* freespace structure */
+
+ free = bp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->geo->free_hdr_size - 1);
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int /* error */
+xfs_dir2_leaf_to_node(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *lbp) /* leaf buffer */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_db_t fdb; /* freespace block number */
+ __be16 *from; /* pointer to freespace entry */
+ int i; /* leaf freespace index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ int n; /* count of live freespc ents */
+ xfs_dir2_data_off_t off; /* freespace entry value */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ trace_xfs_dir2_leaf_to_node(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Add a freespace block to the directory.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+ return error;
+ }
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+ /*
+ * Get the buffer for the new freespace block.
+ */
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+ if (error)
+ return error;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ if (be32_to_cpu(ltp->bestcount) >
+ (uint)dp->i_disk_size / args->geo->blksize) {
+ xfs_buf_mark_corrupt(lbp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Copy freespace entries from the leaf block to the new block.
+ * Count active entries.
+ */
+ from = xfs_dir2_leaf_bests_p(ltp);
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) {
+ off = be16_to_cpu(*from);
+ if (off != NULLDATAOFF)
+ n++;
+ freehdr.bests[i] = cpu_to_be16(off);
+ }
+
+ /*
+ * Now initialize the freespace block header.
+ */
+ freehdr.nused = n;
+ freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * Converting the leaf to a leafnode is just a matter of changing the
+ * magic number and the ops. Do the change directly to the buffer as
+ * it's less work (and less code) than decoding the header to host
+ * format and back again.
+ */
+ if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ else
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int /* error */
+xfs_dir2_leafn_add(
+ struct xfs_buf *bp, /* leaf buffer */
+ struct xfs_da_args *args, /* operation arguments */
+ int index) /* insertion pt for new entry */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
+ int compact; /* compacting stale leaves */
+ int highstale = 0; /* next stale entry */
+ int lfloghigh; /* high leaf entry logging */
+ int lfloglow; /* low leaf entry logging */
+ int lowstale = 0; /* previous stale entry */
+
+ trace_xfs_dir2_leafn_add(args, index);
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+
+ /*
+ * Quick check just to make sure we are not going to index
+ * into other peoples memory
+ */
+ if (index < 0) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * If there are already the maximum number of leaf entries in
+ * the block, if there are no stale entries it won't fit.
+ * Caller will do a split. If there are stale entries we'll do
+ * a compact.
+ */
+
+ if (leafhdr.count == args->geo->leaf_max_ents) {
+ if (!leafhdr.stale)
+ return -ENOSPC;
+ compact = leafhdr.stale > 1;
+ } else
+ compact = 0;
+ ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == leafhdr.count ||
+ be32_to_cpu(ents[index].hashval) >= args->hashval);
+
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+
+ /*
+ * Compact out all but one stale leaf entry. Leaves behind
+ * the entry closest to index.
+ */
+ if (compact)
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ else if (leafhdr.stale) {
+ /*
+ * Set impossible logging indices for this case.
+ */
+ lfloglow = leafhdr.count;
+ lfloghigh = -1;
+ }
+
+ /*
+ * Insert the new entry, log everything.
+ */
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+ args->blkno, args->index));
+
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, bp);
+ return 0;
+}
+
+#ifdef DEBUG
+static void
+xfs_dir2_free_hdr_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ xfs_dir2_db_t db)
+{
+ struct xfs_dir3_icfree_hdr hdr;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr);
+
+ ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0);
+ ASSERT(hdr.firstdb <= db);
+ ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif /* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t /* hash value */
+xfs_dir2_leaf_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp, /* leaf buffer */
+ int *count) /* count of entries in leaf */
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+
+ if (count)
+ *count = leafhdr.count;
+ if (!leafhdr.count)
+ return 0;
+ return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_db_t curfdb = -1; /* current free block number */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int fi; /* free entry index */
+ xfs_dir2_free_t *free = NULL; /* free block structure */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int length; /* length of new data entry */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_dir2_db_t newfdb; /* new free block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ /* If so, it's a free block buffer, get the block number. */
+ curbp = state->extrablk.bp;
+ curfdb = state->extrablk.blkno;
+ free = curbp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ }
+ length = xfs_dir2_data_entsize(mp, args->namelen);
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leafhdr.ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * For addname, we're looking for a place to put the new entry.
+ * We want to use a data block with an entry of equal
+ * hash value to ours if there is one with room.
+ *
+ * If this block isn't the data block we already have
+ * in hand, take a look at it.
+ */
+ if (newdb != curdb) {
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ curdb = newdb;
+ /*
+ * Convert the data block to the free block
+ * holding its freespace information.
+ */
+ newfdb = xfs_dir2_db_to_fdb(args->geo, newdb);
+ /*
+ * If it's not the one we have in hand, read it in.
+ */
+ if (newfdb != curfdb) {
+ /*
+ * If we had one before, drop it.
+ */
+ if (curbp)
+ xfs_trans_brelse(tp, curbp);
+
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
+ &curbp);
+ if (error)
+ return error;
+ free = curbp->b_addr;
+
+ xfs_dir2_free_hdr_check(dp, curbp, curdb);
+ }
+ /*
+ * Get the index for our entry.
+ */
+ fi = xfs_dir2_db_to_fdindex(args->geo, curdb);
+ /*
+ * If it has room, return it.
+ */
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, free);
+ if (XFS_IS_CORRUPT(mp,
+ freehdr.bests[fi] ==
+ cpu_to_be16(NULLDATAOFF))) {
+ if (curfdb != newfdb)
+ xfs_trans_brelse(tp, curbp);
+ return -EFSCORRUPTED;
+ }
+ curfdb = newfdb;
+ if (be16_to_cpu(freehdr.bests[fi]) >= length)
+ goto out;
+ }
+ }
+ /* Didn't find any space */
+ fi = -1;
+out:
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (curbp) {
+ /* Giving back a free block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = fi;
+ state->extrablk.blkno = curfdb;
+
+ /*
+ * Important: this magic number is not in the buffer - it's for
+ * buffer type information and therefore only the free/data type
+ * matters here, not whether CRCs are enabled or not.
+ */
+ state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ } else {
+ state->extravalid = 0;
+ }
+ /*
+ * Return the index, that will be the insertion point.
+ */
+ *indexp = index;
+ return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ if (leafhdr.count <= 0) {
+ xfs_buf_mark_corrupt(bp);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ curbp = state->extrablk.bp;
+ curdb = state->extrablk.blkno;
+ }
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leafhdr.ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
+ /*
+ * Not adding a new entry, so we really want to find
+ * the name given to us.
+ *
+ * If it's a different data block, go get it.
+ */
+ if (newdb != curdb) {
+ /*
+ * If we had a block before that we aren't saving
+ * for a CI name, drop it
+ */
+ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+ curdb != state->extrablk.blkno))
+ xfs_trans_brelse(tp, curbp);
+ /*
+ * If needing the block that is saved with a CI match,
+ * use it otherwise read in the new data block.
+ */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ newdb == state->extrablk.blkno) {
+ ASSERT(state->extravalid);
+ curbp = state->extrablk.bp;
+ } else {
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
+ 0, &curbp);
+ if (error)
+ return error;
+ }
+ xfs_dir3_data_check(dp, curbp);
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ /*
+ * Compare the entry and if it's an exact match, return
+ * EEXIST immediately. If it's the first case-insensitive
+ * match, store the block & inode number and continue looking.
+ */
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ /* If there is a CI match block, drop it */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ curdb != state->extrablk.blkno)
+ xfs_trans_brelse(tp, state->extrablk.bp);
+ args->cmpresult = cmp;
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = xfs_dir2_data_get_ftype(mp, dep);
+ *indexp = index;
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.blkno = curdb;
+ state->extrablk.index = (int)((char *)dep -
+ (char *)curbp->b_addr);
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ if (cmp == XFS_CMP_EXACT)
+ return -EEXIST;
+ }
+ }
+ ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+ if (curbp) {
+ if (args->cmpresult == XFS_CMP_DIFFERENT) {
+ /* Giving back last used data block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = -1;
+ state->extrablk.blkno = curdb;
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ } else {
+ /* If the curbp is not the CI match block, drop it */
+ if (state->extrablk.bp != curbp)
+ xfs_trans_brelse(tp, curbp);
+ }
+ } else {
+ state->extravalid = 0;
+ }
+ *indexp = index;
+ return -ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ if (args->op_flags & XFS_DA_OP_ADDNAME)
+ return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+ state);
+ return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers. Stale entries are preserved.
+ */
+static void
+xfs_dir3_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp_s, /* source */
+ struct xfs_dir3_icleaf_hdr *shdr,
+ struct xfs_dir2_leaf_entry *sents,
+ int start_s,/* source leaf index */
+ struct xfs_buf *bp_d, /* destination */
+ struct xfs_dir3_icleaf_hdr *dhdr,
+ struct xfs_dir2_leaf_entry *dents,
+ int start_d,/* destination leaf index */
+ int count) /* count of leaves to copy */
+{
+ int stale; /* count stale leaves copied */
+
+ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+ /*
+ * Silently return if nothing to do.
+ */
+ if (count == 0)
+ return;
+
+ /*
+ * If the destination index is not the end of the current
+ * destination leaf entries, open up a hole in the destination
+ * to hold the new entries.
+ */
+ if (start_d < dhdr->count) {
+ memmove(&dents[start_d + count], &dents[start_d],
+ (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count,
+ count + dhdr->count - 1);
+ }
+ /*
+ * If the source has stale leaves, count the ones in the copy range
+ * so we can update the header correctly.
+ */
+ if (shdr->stale) {
+ int i; /* temp leaf index */
+
+ for (i = start_s, stale = 0; i < start_s + count; i++) {
+ if (sents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ } else
+ stale = 0;
+ /*
+ * Copy the leaf entries from source to destination.
+ */
+ memcpy(&dents[start_d], &sents[start_s],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1);
+
+ /*
+ * If there are source entries after the ones we copied,
+ * delete the ones we copied by sliding the next ones down.
+ */
+ if (start_s + count < shdr->count) {
+ memmove(&sents[start_s], &sents[start_s + count],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s,
+ start_s + count - 1);
+ }
+
+ /*
+ * Update the headers and log them.
+ */
+ shdr->count -= count;
+ shdr->stale -= stale;
+ dhdr->count += count;
+ dhdr->stale += stale;
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int /* sort order */
+xfs_dir2_leafn_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
+{
+ struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
+ struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
+
+ if (hdr1.count > 0 && hdr2.count > 0 &&
+ (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+ be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+ be32_to_cpu(ents1[hdr1.count - 1].hashval)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *blk1, /* first btree block */
+ xfs_da_state_blk_t *blk2) /* second btree block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ int count; /* count (& direction) leaves */
+ int isleft; /* new goes in left leaf */
+ xfs_dir2_leaf_t *leaf1; /* first leaf structure */
+ xfs_dir2_leaf_t *leaf2; /* second leaf structure */
+ int mid; /* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+ int oldstale; /* old count of stale leaves */
+#endif
+ int oldsum; /* old total leaf count */
+ int swap_blocks; /* swapped leaf blocks */
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+ /*
+ * If the block order is wrong, swap the arguments.
+ */
+ swap_blocks = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp);
+ if (swap_blocks)
+ swap(blk1, blk2);
+
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
+
+ oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+ oldstale = hdr1.stale + hdr2.stale;
+#endif
+ mid = oldsum >> 1;
+
+ /*
+ * If the old leaf count was odd then the new one will be even,
+ * so we need to divide the new count evenly.
+ */
+ if (oldsum & 1) {
+ xfs_dahash_t midhash; /* middle entry hash value */
+
+ if (mid >= hdr1.count)
+ midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+ else
+ midhash = be32_to_cpu(ents1[mid].hashval);
+ isleft = args->hashval <= midhash;
+ }
+ /*
+ * If the old count is even then the new count is odd, so there's
+ * no preferred side for the new entry.
+ * Pick the left one.
+ */
+ else
+ isleft = 1;
+ /*
+ * Calculate moved entry count. Positive means left-to-right,
+ * negative means right-to-left. Then move the entries.
+ */
+ count = hdr1.count - mid + (isleft == 0);
+ if (count > 0)
+ xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+ hdr1.count - count, blk2->bp,
+ &hdr2, ents2, 0, count);
+ else if (count < 0)
+ xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+ blk1->bp, &hdr1, ents1,
+ hdr1.count, count);
+
+ ASSERT(hdr1.count + hdr2.count == oldsum);
+ ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+ /* log the changes made when moving the entries */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
+
+ xfs_dir3_leaf_check(dp, blk1->bp);
+ xfs_dir3_leaf_check(dp, blk2->bp);
+
+ /*
+ * Mark whether we're inserting into the old or new leaf.
+ */
+ if (hdr1.count < hdr2.count)
+ state->inleaf = swap_blocks;
+ else if (hdr1.count > hdr2.count)
+ state->inleaf = !swap_blocks;
+ else
+ state->inleaf = swap_blocks ^ (blk1->index <= hdr1.count);
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (!state->inleaf)
+ blk2->index = blk1->index - hdr1.count;
+
+ /*
+ * Finally sanity check just to make sure we are not returning a
+ * negative index
+ */
+ if (blk2->index < 0) {
+ state->inleaf = 1;
+ blk2->index = 0;
+ xfs_alert(dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+ __func__, blk1->index);
+ }
+}
+
+static int
+xfs_dir3_data_block_free(
+ xfs_da_args_t *args,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_free *free,
+ xfs_dir2_db_t fdb,
+ int findex,
+ struct xfs_buf *fbp,
+ int longest)
+{
+ int logfree = 0;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_inode *dp = args->dp;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ if (hdr) {
+ /*
+ * Data block is not empty, just set the free entry to the new
+ * value.
+ */
+ freehdr.bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+ return 0;
+ }
+
+ /* One less used entry in the free table. */
+ freehdr.nused--;
+
+ /*
+ * If this was the last entry in the table, we can trim the table size
+ * back. There might be other entries at the end referring to
+ * non-existent data blocks, get those too.
+ */
+ if (findex == freehdr.nvalid - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ freehdr.nvalid = i + 1;
+ logfree = 0;
+ } else {
+ /* Not the last entry, just punch it out. */
+ freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * If there are no useful entries left in the block, get rid of the
+ * block if we can.
+ */
+ if (!freehdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != -ENOSPC || args->total != 0)
+ return error;
+ /*
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case some one
+ * else will eventually get rid of this block.
+ */
+ }
+
+ /* Log the free entry that changed, unless we got rid of it. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+ return 0;
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int /* error */
+xfs_dir2_leafn_remove(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp, /* leaf buffer */
+ int index, /* leaf entry index */
+ xfs_da_state_blk_t *dblk, /* data block */
+ int *rval) /* resulting block needs join */
+{
+ struct xfs_da_geometry *geo = args->geo;
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_db_t db; /* data block number */
+ struct xfs_buf *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int longest; /* longest data free entry */
+ int off; /* data block entry offset */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leafn_remove(args, index);
+
+ dp = args->dp;
+ tp = args->trans;
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+
+ /*
+ * Point to the entry we're removing.
+ */
+ lep = &leafhdr.ents[index];
+
+ /*
+ * Extract the data block and offset from the entry.
+ */
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
+ ASSERT(dblk->blkno == db);
+ off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address));
+ ASSERT(dblk->index == off);
+
+ /*
+ * Kill the leaf entry by marking it stale.
+ * Log the leaf block changes.
+ */
+ leafhdr.stale++;
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index);
+
+ /*
+ * Make the data entry free. Keep track of the longest freespace
+ * in the data block in case it changes.
+ */
+ dbp = dblk->bp;
+ hdr = dbp->b_addr;
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ longest = be16_to_cpu(bf[0].length);
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(args, dbp, off,
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
+ /*
+ * Rescan the data block freespaces for bestfree.
+ * Log the data block header if needed.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_data_check(dp, dbp);
+ /*
+ * If the longest data block freespace changes, need to update
+ * the corresponding freeblock entry.
+ */
+ if (longest < be16_to_cpu(bf[0].length)) {
+ int error; /* error return value */
+ struct xfs_buf *fbp; /* freeblock buffer */
+ xfs_dir2_db_t fdb; /* freeblock block number */
+ int findex; /* index in freeblock entries */
+ xfs_dir2_free_t *free; /* freeblock structure */
+
+ /*
+ * Convert the data block number to a free block,
+ * read in the free block.
+ */
+ fdb = xfs_dir2_db_to_fdb(geo, db);
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
+ &fbp);
+ if (error)
+ return error;
+ free = fbp->b_addr;
+#ifdef DEBUG
+ {
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ ASSERT(freehdr.firstdb == geo->free_max_bests *
+ (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET)));
+ }
+#endif
+ /*
+ * Calculate which entry we need to fix.
+ */
+ findex = xfs_dir2_db_to_fdindex(geo, db);
+ longest = be16_to_cpu(bf[0].length);
+ /*
+ * If the data block is now empty we can get rid of it
+ * (usually).
+ */
+ if (longest == geo->blksize - geo->data_entry_offset) {
+ /*
+ * Try to punch out the data block.
+ */
+ error = xfs_dir2_shrink_inode(args, db, dbp);
+ if (error == 0) {
+ dblk->bp = NULL;
+ hdr = NULL;
+ }
+ /*
+ * We can get ENOSPC if there's no space reservation.
+ * In this case just drop the buffer and some one else
+ * will eventually get rid of the empty block.
+ */
+ else if (!(error == -ENOSPC && args->total == 0))
+ return error;
+ }
+ /*
+ * If we got rid of the data block, we can eliminate that entry
+ * in the free block.
+ */
+ error = xfs_dir3_data_block_free(args, hdr, free,
+ fdb, findex, fbp, longest);
+ if (error)
+ return error;
+ }
+
+ xfs_dir3_leaf_check(dp, bp);
+ /*
+ * Return indication of whether this leaf block is empty enough
+ * to justify trying to join it with a neighbor.
+ */
+ *rval = (geo->leaf_hdr_size +
+ (uint)sizeof(leafhdr.ents) * (leafhdr.count - leafhdr.stale)) <
+ geo->magicpct;
+ return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int /* error */
+xfs_dir2_leafn_split(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *oldblk, /* original block */
+ xfs_da_state_blk_t *newblk) /* newly created block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dablk_t blkno; /* new leaf block number */
+ int error; /* error return value */
+ struct xfs_inode *dp;
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ args = state->args;
+ dp = args->dp;
+ ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error) {
+ return error;
+ }
+ /*
+ * Initialize the new leaf block.
+ */
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error)
+ return error;
+
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+ /*
+ * Rebalance the entries across the two leaves, link the new
+ * block into the leaves.
+ */
+ xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
+ if (error) {
+ return error;
+ }
+ /*
+ * Insert the new entry in the correct block.
+ */
+ if (state->inleaf)
+ error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+ else
+ error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_dir2_leaf_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leaf_lasthash(dp, newblk->bp, NULL);
+ xfs_dir3_leaf_check(dp, oldblk->bp);
+ xfs_dir3_leaf_check(dp, newblk->bp);
+ return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int /* error */
+xfs_dir2_leafn_toosmall(
+ xfs_da_state_t *state, /* btree cursor */
+ int *action) /* resulting action to take */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dablk_t blkno; /* leaf block number */
+ struct xfs_buf *bp; /* leaf buffer */
+ int bytes; /* bytes in use */
+ int count; /* leaf live entry count */
+ int error; /* error return value */
+ int forward; /* sibling block direction */
+ int i; /* sibling counter */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int rval; /* result from path_shift */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = state->args->dp;
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ leaf = blk->bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
+ xfs_dir3_leaf_check(dp, blk->bp);
+
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]);
+ if (bytes > (state->args->geo->blksize >> 1)) {
+ /*
+ * Blk over 50%, don't try to join.
+ */
+ *action = 0;
+ return 0;
+ }
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (leafhdr.forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ if (error)
+ return error;
+ *action = rval ? 2 : 0;
+ return 0;
+ }
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ forward = leafhdr.forw < leafhdr.back;
+ for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ blkno = forward ? leafhdr.forw : leafhdr.back;
+ if (blkno == 0)
+ continue;
+ /*
+ * Read the sibling leaf block.
+ */
+ error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Count bytes in the two blocks combined.
+ */
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
+
+ leaf = bp->b_addr;
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf);
+ ents = hdr2.ents;
+ count += hdr2.count - hdr2.stale;
+ bytes -= count * sizeof(ents[0]);
+
+ /*
+ * Fits with at least 25% to spare.
+ */
+ if (bytes >= 0)
+ break;
+ xfs_trans_brelse(state->args->trans, bp);
+ }
+ /*
+ * Didn't like either block, give up.
+ */
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno)
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ else
+ error = xfs_da3_path_shift(state, &state->path, forward, 0,
+ &rval);
+ if (error) {
+ return error;
+ }
+ *action = rval ? 0 : 1;
+ return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+ xfs_da_state_t *state, /* cursor */
+ xfs_da_state_blk_t *drop_blk, /* dead block */
+ xfs_da_state_blk_t *save_blk) /* surviving block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
+ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+ struct xfs_dir3_icleaf_hdr savehdr;
+ struct xfs_dir3_icleaf_hdr drophdr;
+ struct xfs_dir2_leaf_entry *sents;
+ struct xfs_dir2_leaf_entry *dents;
+ struct xfs_inode *dp = state->args->dp;
+
+ args = state->args;
+ ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf);
+ sents = savehdr.ents;
+ dents = drophdr.ents;
+
+ /*
+ * If there are any stale leaf entries, take this opportunity
+ * to purge them.
+ */
+ if (drophdr.stale)
+ xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+ if (savehdr.stale)
+ xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
+ /*
+ * Move the entries from drop to the appropriate end of save.
+ */
+ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+ if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents, 0,
+ drophdr.count);
+ else
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents,
+ savehdr.count, drophdr.count);
+ save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+ /* log the changes made when moving the entries */
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+ xfs_dir3_leaf_check(dp, save_blk->bp);
+ xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+
+/*
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
+ */
+static int
+xfs_dir2_node_add_datablk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbno,
+ struct xfs_buf **dbpp,
+ struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
+ int *findex)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_data_free *bf;
+ xfs_dir2_db_t fbno;
+ struct xfs_buf *fbp;
+ struct xfs_buf *dbp;
+ int error;
+
+ /* Not allowed to allocate, return failure. */
+ if (args->total == 0)
+ return -ENOSPC;
+
+ /* Allocate and initialize the new data block. */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+ if (error)
+ return error;
+ error = xfs_dir3_data_init(args, *dbno, &dbp);
+ if (error)
+ return error;
+
+ /*
+ * Get the freespace block corresponding to the data block
+ * that was just allocated.
+ */
+ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+ if (error)
+ return error;
+
+ /*
+ * If there wasn't a freespace block, the read will
+ * return a NULL fbp. Allocate and initialize a new one.
+ */
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp,
+ xfs_dir2_db_to_fdb(args->geo, *dbno) !=
+ fbno)) {
+ xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)xfs_dir2_db_to_fdb(args->geo, *dbno),
+ (long long)*dbno, (long long)fbno);
+ if (fblk) {
+ xfs_alert(mp,
+ " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+ fblk, (unsigned long long)fblk->blkno,
+ fblk->index, fblk->magic);
+ } else {
+ xfs_alert(mp, " ... fblk is NULL");
+ }
+ return -EFSCORRUPTED;
+ }
+
+ /* Get a buffer for the new block. */
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+ if (error)
+ return error;
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
+
+ /* Remember the first slot as our empty slot. */
+ hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ args->geo->free_max_bests;
+ } else {
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
+ }
+
+ /* Set the freespace block index from the data block number. */
+ *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno);
+
+ /* Extend the freespace table if the new data block is off the end. */
+ if (*findex >= hdr->nvalid) {
+ ASSERT(*findex < args->geo->free_max_bests);
+ hdr->nvalid = *findex + 1;
+ hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ }
+
+ /*
+ * If this entry was for an empty data block (this should always be
+ * true) then update the header.
+ */
+ if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ hdr->nused++;
+ xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr);
+ xfs_dir2_free_log_header(args, fbp);
+ }
+
+ /* Update the freespace value for the new block in the table. */
+ bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr);
+ hdr->bests[*findex] = bf[0].length;
+
+ *dbpp = dbp;
+ *fbpp = fbp;
+ return 0;
+}
+
+static int
+xfs_dir2_node_find_freeblk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbnop,
+ struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
+ int *findexp,
+ int length)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_buf *fbp = NULL;
+ xfs_dir2_db_t firstfbno;
+ xfs_dir2_db_t lastfbno;
+ xfs_dir2_db_t ifbno = -1;
+ xfs_dir2_db_t dbno = -1;
+ xfs_dir2_db_t fbno;
+ xfs_fileoff_t fo;
+ int findex = 0;
+ int error;
+
+ /*
+ * If we came in with a freespace block that means that lookup
+ * found an entry with our hash value. This is the freespace
+ * block for that data entry.
+ */
+ if (fblk) {
+ fbp = fblk->bp;
+ findex = fblk->index;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
+ if (findex >= 0) {
+ /* caller already found the freespace for us. */
+ ASSERT(findex < hdr->nvalid);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) >= length);
+ dbno = hdr->firstdb + findex;
+ goto found_block;
+ }
+
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
+ ifbno = fblk->blkno;
+ xfs_trans_brelse(tp, fbp);
+ fbp = NULL;
+ fblk->bp = NULL;
+ }
+
+ /*
+ * If we don't have a data block yet, we're going to scan the freespace
+ * data for a data block with enough free space in it.
+ */
+ error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+ if (error)
+ return error;
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+ firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
+
+ for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+ /* If it's ifbno we already looked at it. */
+ if (fbno == ifbno)
+ continue;
+
+ /*
+ * Read the block. There can be holes in the freespace blocks,
+ * so this might not succeed. This should be really rare, so
+ * there's no reason to avoid it.
+ */
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
+ if (error)
+ return error;
+ if (!fbp)
+ continue;
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
+
+ /* Scan the free entry array for a large enough free space. */
+ for (findex = hdr->nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(hdr->bests[findex]) >= length) {
+ dbno = hdr->firstdb + findex;
+ goto found_block;
+ }
+ }
+
+ /* Didn't find free space, go on to next free block */
+ xfs_trans_brelse(tp, fbp);
+ }
+
+found_block:
+ *dbnop = dbno;
+ *fbpp = fbp;
+ *findexp = findex;
+ return 0;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_da_state_blk *fblk) /* optional freespace block */
+{
+ struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
+ struct xfs_dir2_data_entry *dep; /* data entry pointer */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_data_aoff_t aoff;
+ xfs_dir2_db_t dbno; /* data block number */
+ int error; /* error return value */
+ int findex; /* freespace entry index */
+ int length; /* length of the new entry */
+ int logfree = 0; /* need to log free entry */
+ int needlog = 0; /* need to log data header */
+ int needscan = 0; /* need to rescan data frees */
+ __be16 *tagp; /* data entry tag pointer */
+
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr,
+ &findex, length);
+ if (error)
+ return error;
+
+ /*
+ * Now we know if we must allocate blocks, so if we are checking whether
+ * we can insert without allocation then we can return now.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ if (dbno == -1)
+ return -ENOSPC;
+ return 0;
+ }
+
+ /*
+ * If we don't have a data block, we need to allocate one and make
+ * the freespace entries refer to it.
+ */
+ if (dbno == -1) {
+ /* we're going to have to log the free block index later */
+ logfree = 1;
+ error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+ &freehdr, &findex);
+ } else {
+ /* Read the data block in. */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
+ 0, &dbp);
+ }
+ if (error)
+ return error;
+
+ /* setup for data block up now */
+ hdr = dbp->b_addr;
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
+
+ /* Point to the existing unused space. */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+
+ /* Mark the first part of the unused space, inuse for us. */
+ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+ error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+ &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
+
+ /* Fill in the new entry and log it. */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+
+ /* Rescan the freespace and log the data block if needed. */
+ if (needscan)
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+
+ /* If the freespace block entry is now wrong, update it. */
+ if (freehdr.bests[findex] != bf[0].length) {
+ freehdr.bests[findex] = bf[0].length;
+ logfree = 1;
+ }
+
+ /* Log the freespace entry if needed. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
+
+ /* Return the data block and offset in args. */
+ args->blkno = (xfs_dablk_t)dbno;
+ args->index = be16_to_cpu(*tagp);
+ return 0;
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int /* error */
+xfs_dir2_node_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block for insert */
+ int error; /* error return value */
+ int rval; /* sub-return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_addname(args);
+
+ /*
+ * Allocate and initialize the state (btree cursor).
+ */
+ state = xfs_da_state_alloc(args);
+ /*
+ * Look up the name. We're not supposed to find it, but
+ * this gives us the insertion point.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ if (rval != -ENOENT) {
+ goto done;
+ }
+ /*
+ * Add the data entry to a data block.
+ * Extravalid is set to a freeblock found by lookup.
+ */
+ rval = xfs_dir2_node_addname_int(args,
+ state->extravalid ? &state->extrablk : NULL);
+ if (rval) {
+ goto done;
+ }
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Add the new leaf entry.
+ */
+ rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+ if (rval == 0) {
+ /*
+ * It worked, fix the hash values up the btree.
+ */
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+ xfs_da3_fixhashpath(state, &state->path);
+ } else {
+ /*
+ * It didn't work, we need to split the leaf block.
+ */
+ if (args->total == 0) {
+ ASSERT(rval == -ENOSPC);
+ goto done;
+ }
+ /*
+ * Split the leaf block and insert the new entry.
+ */
+ rval = xfs_da3_split(state);
+ }
+done:
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da3_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int /* error */
+xfs_dir2_node_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ int error; /* error return value */
+ int i; /* btree level */
+ int rval; /* operation return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_lookup(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /*
+ * Fill in the path to the entry in the cursor.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return -EEXIST */
+ xfs_dir2_data_entry_t *dep;
+
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)state->extrablk.bp->b_addr +
+ state->extrablk.index);
+ rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ }
+ /*
+ * Release the btree blocks and leaf block.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ /*
+ * Release the data block if we have it.
+ */
+ if (state->extravalid && state->extrablk.bp) {
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_removename(
+ struct xfs_da_args *args) /* operation arguments */
+{
+ struct xfs_da_state_blk *blk; /* leaf block */
+ int error; /* error return value */
+ int rval; /* operation return value */
+ struct xfs_da_state *state; /* btree cursor */
+
+ trace_xfs_dir2_node_removename(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /* Look up the entry we're deleting, set up the cursor. */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ goto out_free;
+
+ /* Didn't find it, upper layer screwed up. */
+ if (rval != -EEXIST) {
+ error = rval;
+ goto out_free;
+ }
+
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(state->extravalid);
+ /*
+ * Remove the leaf and data entries.
+ * Extrablk refers to the data block.
+ */
+ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+ &state->extrablk, &rval);
+ if (error)
+ goto out_free;
+ /*
+ * Fix the hash values up the btree.
+ */
+ xfs_da3_fixhashpath(state, &state->path);
+ /*
+ * If we need to join leaf blocks, do it.
+ */
+ if (rval && state->path.active > 1)
+ error = xfs_da3_join(state);
+ /*
+ * If no errors so far, try conversion to leaf format.
+ */
+ if (!error)
+ error = xfs_dir2_node_to_leaf(state);
+out_free:
+ xfs_da_state_free(state);
+ return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_entry_t *dep; /* data entry changed */
+ int error; /* error return value */
+ int i; /* btree level */
+ xfs_ino_t inum; /* new inode number */
+ int ftype; /* new file type */
+ int rval; /* internal return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_replace(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc(args);
+
+ /*
+ * We have to save new inode number and ftype since
+ * xfs_da3_node_lookup_int() is going to overwrite them
+ */
+ inum = args->inumber;
+ ftype = args->filetype;
+
+ /*
+ * Lookup the entry to change in the btree.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error) {
+ rval = error;
+ }
+ /*
+ * It should be found, since the vnodeops layer has looked it up
+ * and locked it. But paranoia is good.
+ */
+ if (rval == -EEXIST) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ /*
+ * Find the leaf entry.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(state->extravalid);
+
+ xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr,
+ blk->bp->b_addr);
+ /*
+ * Point to the data entry.
+ */
+ hdr = state->extrablk.bp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(leafhdr.ents[blk->index].address)));
+ ASSERT(inum != be64_to_cpu(dep->inumber));
+ /*
+ * Fill in the new inode number and log the entry.
+ */
+ dep->inumber = cpu_to_be64(inum);
+ xfs_dir2_data_put_ftype(state->mp, dep, ftype);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+ rval = 0;
+ }
+ /*
+ * Didn't find it, and we're holding a data block. Drop it.
+ */
+ else if (state->extravalid) {
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ /*
+ * Release all the buffers in the cursor.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int /* error */
+xfs_dir2_node_trim_free(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_fileoff_t fo, /* free block number */
+ int *rvalp) /* out: did something */
+{
+ struct xfs_buf *bp; /* freespace buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_free_t *free; /* freespace structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ dp = args->dp;
+ tp = args->trans;
+
+ *rvalp = 0;
+
+ /*
+ * Read the freespace block.
+ */
+ error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ if (error)
+ return error;
+ /*
+ * There can be holes in freespace. If fo is a hole, there's
+ * nothing to do.
+ */
+ if (!bp)
+ return 0;
+ free = bp->b_addr;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+
+ /*
+ * If there are used entries, there's nothing to do.
+ */
+ if (freehdr.nused > 0) {
+ xfs_trans_brelse(tp, bp);
+ return 0;
+ }
+ /*
+ * Blow the block away.
+ */
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
+ /*
+ * Can't fail with ENOSPC since that only happens with no
+ * space reservation, when breaking up an extent into two
+ * pieces. This is the last block of an extent.
+ */
+ ASSERT(error != -ENOSPC);
+ xfs_trans_brelse(tp, bp);
+ return error;
+ }
+ /*
+ * Return that we succeeded.
+ */
+ *rvalp = 1;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000..7404a9ff1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/*
+ * In-core version of the leaf and free block headers to abstract the
+ * differences in the v2 and v3 disk format of the headers.
+ */
+struct xfs_dir3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_dir2_leaf_entry *ents;
+};
+
+struct xfs_dir3_icfree_hdr {
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ __be16 *bests;
+};
+
+/* xfs_dir2.c */
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
+enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+ xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+ struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr);
+__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+void xfs_dir2_data_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep, uint8_t ftype);
+
+#ifdef DEBUG
+extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#else
+#define xfs_dir3_data_check(dp,bp)
+#endif
+
+extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
+ struct xfs_buf *bp);
+int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+ int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+ struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from);
+void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+ struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int *indexp,
+ int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_buf **bpp, uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first,
+ int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+ struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int index, int compact,
+ int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf,
+ bool expensive_checks);
+
+/* xfs_dir2_node.c */
+void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from);
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp,
+ struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+ struct xfs_da_args *args, int *indexp,
+ struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+ int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr);
+void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino);
+uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep);
+struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep);
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+ int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+int xfs_dir2_sf_entsize(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, int len);
+void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino);
+void xfs_dir2_sf_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep, uint8_t ftype);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct dir_context *ctx, size_t bufsize);
+
+static inline unsigned int
+xfs_dir2_data_entsize(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int len;
+
+ len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
+ sizeof(xfs_dir2_data_off_t) /* tag */;
+ if (xfs_has_ftype(mp))
+ len += sizeof(uint8_t);
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
+enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000..8cd37e6e9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1293 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+ xfs_dir2_sf_entry_t *sfep,
+ xfs_dir2_data_aoff_t offset,
+ int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+ int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+ xfs_dir2_sf_entry_t **sfepp,
+ xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+
+int
+xfs_dir2_sf_entsize(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = len;
+
+ count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
+
+ if (xfs_has_ftype(mp))
+ count += sizeof(uint8_t);
+ return count;
+}
+
+struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. The actual inode numbers can
+ * come in two formats as well, either 4 bytes or 8 bytes wide.
+ */
+xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ uint8_t *from = sfep->name + sfep->namelen;
+
+ if (xfs_has_ftype(mp))
+ from++;
+
+ if (!hdr->i8count)
+ return get_unaligned_be32(from);
+ return get_unaligned_be64(from) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ uint8_t *to = sfep->name + sfep->namelen;
+
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (xfs_has_ftype(mp))
+ to++;
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, to);
+ else
+ put_unaligned_be32(ino, to);
+}
+
+xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ if (!hdr->i8count)
+ return get_unaligned_be32(hdr->parent);
+ return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, hdr->parent);
+ else
+ put_unaligned_be32(ino, hdr->parent);
+}
+
+/*
+ * The file type field is stored at the end of the name for filetype enabled
+ * shortform directories, or not at all otherwise.
+ */
+uint8_t
+xfs_dir2_sf_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ if (xfs_has_ftype(mp)) {
+ uint8_t ftype = sfep->name[sfep->namelen];
+
+ if (ftype < XFS_DIR3_FT_MAX)
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_sf_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ if (xfs_has_ftype(mp))
+ sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit it the
+ * space currently present in the inode. If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int /* size for sf form */
+xfs_dir2_block_sfsize(
+ xfs_inode_t *dp, /* incore inode pointer */
+ xfs_dir2_data_hdr_t *hdr, /* block directory data */
+ xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
+ xfs_dir2_block_tail_t *btp; /* tail area of the block */
+ int count; /* shortform entry count */
+ xfs_dir2_data_entry_t *dep; /* data entry in the block */
+ int i; /* block entry index */
+ int i8count; /* count of big-inode entries */
+ int isdot; /* entry is "." */
+ int isdotdot; /* entry is ".." */
+ xfs_mount_t *mp; /* mount structure pointer */
+ int namelen; /* total name bytes */
+ xfs_ino_t parent = 0; /* parent inode number */
+ int size=0; /* total computed size */
+ int has_ftype;
+ struct xfs_da_geometry *geo;
+
+ mp = dp->i_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * if there is a filetype field, add the extra byte to the namelen
+ * for each entry that we see.
+ */
+ has_ftype = xfs_has_ftype(mp) ? 1 : 0;
+
+ count = i8count = namelen = 0;
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Iterate over the block's data entries by using the leaf pointers.
+ */
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Calculate the pointer to the entry at hand.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
+ /*
+ * Detect . and .., so we can special-case them.
+ * . is not included in sf directories.
+ * .. is included by just the parent inode number.
+ */
+ isdot = dep->namelen == 1 && dep->name[0] == '.';
+ isdotdot =
+ dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.';
+
+ if (!isdot)
+ i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+
+ /* take into account the file type field */
+ if (!isdot && !isdotdot) {
+ count++;
+ namelen += dep->namelen + has_ftype;
+ } else if (isdotdot)
+ parent = be64_to_cpu(dep->inumber);
+ /*
+ * Calculate the new size, see if we should give up yet.
+ */
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
+ count * 3 * sizeof(u8) + /* namelen + offset */
+ namelen + /* name */
+ (i8count ? /* inumber */
+ count * XFS_INO64_SIZE :
+ count * XFS_INO32_SIZE);
+ if (size > xfs_inode_data_fork_size(dp))
+ return size; /* size value is a failure */
+ }
+ /*
+ * Create the output header, if it worked.
+ */
+ sfhp->count = count;
+ sfhp->i8count = i8count;
+ xfs_dir2_sf_put_parent_ino(sfhp, parent);
+ return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int /* error */
+xfs_dir2_block_to_sf(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_buf *bp,
+ int size, /* shortform directory size */
+ struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error; /* error return value */
+ int logflags; /* inode logging flags */
+ struct xfs_dir2_sf_entry *sfep; /* shortform entry */
+ struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */
+ unsigned int offset = args->geo->data_entry_offset;
+ unsigned int end;
+
+ trace_xfs_dir2_block_to_sf(args);
+
+ /*
+ * Allocate a temporary destination buffer the size of the inode to
+ * format the data into. Once we have formatted the data, we can free
+ * the block and copy the formatted data into the inode literal area.
+ */
+ sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+ memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
+ /*
+ * Loop over the active and unused entries. Stop when we reach the
+ * leaf/tail portion of the block.
+ */
+ end = xfs_dir3_data_end_offset(args->geo, bp->b_addr);
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ /*
+ * If it's unused, just skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /*
+ * Skip .
+ */
+ if (dep->namelen == 1 && dep->name[0] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+ /*
+ * Skip .., but make sure the inode number is right.
+ */
+ else if (dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) ==
+ xfs_dir2_sf_get_parent_ino(sfp));
+ /*
+ * Normal entry, copy it into shortform.
+ */
+ else {
+ sfep->namelen = dep->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, dep->name, dep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ be64_to_cpu(dep->inumber));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_data_get_ftype(mp, dep));
+
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ }
+ ASSERT((char *)sfep - (char *)sfp == size);
+
+ /* now we are done with the block, we can shrink the inode */
+ logflags = XFS_ILOG_CORE;
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+ if (error) {
+ ASSERT(error != -ENOSPC);
+ goto out;
+ }
+
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format and copy the data in.
+ */
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size);
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_disk_size = size;
+
+ logflags |= XFS_ILOG_DDATA;
+ xfs_dir2_sf_check(args);
+out:
+ xfs_trans_log_inode(args->trans, dp, logflags);
+ kmem_free(sfp);
+ return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard" which we decide on
+ * before changing anything.
+ * Convert to block form if necessary, if the new entry won't fit.
+ */
+int /* error */
+xfs_dir2_sf_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int incr_isize; /* total change in size */
+ int new_isize; /* size after adding name */
+ int objchange; /* changing to 8-byte inodes */
+ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
+ int pick; /* which algorithm to use */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
+
+ trace_xfs_dir2_sf_addname(args);
+
+ ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
+ dp = args->dp;
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Compute entry (and change in) size.
+ */
+ incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen);
+ objchange = 0;
+
+ /*
+ * Do we have to change to 8 byte inodes?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+ /*
+ * Yes, adjust the inode size. old count + (parent + new)
+ */
+ incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
+ objchange = 1;
+ }
+
+ new_isize = (int)dp->i_disk_size + incr_isize;
+ /*
+ * Won't fit as shortform any more (due to size),
+ * or the pick routine says it won't (due to offset values).
+ */
+ if (new_isize > xfs_inode_data_fork_size(dp) ||
+ (pick =
+ xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+ /*
+ * Just checking or no space reservation, it doesn't fit.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ return -ENOSPC;
+ /*
+ * Convert to block form then add the name.
+ */
+ error = xfs_dir2_sf_to_block(args);
+ if (error)
+ return error;
+ return xfs_dir2_block_addname(args);
+ }
+ /*
+ * Just checking, it fits.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+ /*
+ * Do it the easy way - just add it at the end.
+ */
+ if (pick == 1)
+ xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+ /*
+ * Do it the hard way - look for a place to insert the new entry.
+ * Convert to 8 byte inode numbers first if necessary.
+ */
+ else {
+ ASSERT(pick == 2);
+ if (objchange)
+ xfs_dir2_sf_toino8(args);
+ xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+ }
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
+ xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
+ int new_isize) /* new directory size */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int byteoff; /* byte offset in sf dir */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ /*
+ * Grow the in-inode space.
+ */
+ xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
+ XFS_DATA_FORK);
+ /*
+ * Need to set up again due to realloc of the inode data.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+ /*
+ * Fill in the new entry.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+
+ /*
+ * Update the header and inode.
+ */
+ sfp->count++;
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+ sfp->i8count++;
+ dp->i_disk_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* changing inode number size */
+ int new_isize) /* new directory size */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int add_datasize; /* data size need for new ent */
+ char *buf; /* buffer for old */
+ int eof; /* reached end of old dir */
+ int nbytes; /* temp for byte copies */
+ xfs_dir2_data_aoff_t new_offset; /* next offset value */
+ xfs_dir2_data_aoff_t offset; /* current offset value */
+ int old_isize; /* previous size */
+ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
+ xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
+ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
+ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
+
+ /*
+ * Copy the old directory to the stack buffer.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ old_isize = (int)dp->i_disk_size;
+ buf = kmem_alloc(old_isize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ memcpy(oldsfp, sfp, old_isize);
+ /*
+ * Loop over the old directory finding the place we're going
+ * to insert the new entry.
+ * If it's going to end up at the end then oldsfep will point there.
+ */
+ for (offset = args->geo->data_first_offset,
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+ add_datasize = xfs_dir2_data_entsize(mp, args->namelen),
+ eof = (char *)oldsfep == &buf[old_isize];
+ !eof;
+ offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep),
+ eof = (char *)oldsfep == &buf[old_isize]) {
+ new_offset = xfs_dir2_sf_get_offset(oldsfep);
+ if (offset + add_datasize <= new_offset)
+ break;
+ }
+ /*
+ * Get rid of the old directory, then allocate space for
+ * the new one. We do this so xfs_idata_realloc won't copy
+ * the data.
+ */
+ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+ /*
+ * Reset the pointer since the buffer was reallocated.
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Copy the first part of the directory, including the header.
+ */
+ nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+ memcpy(sfp, oldsfp, nbytes);
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+ /*
+ * Fill in the new entry, and update the header counts.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+ sfp->count++;
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+ sfp->i8count++;
+ /*
+ * If there's more left to copy, do that.
+ */
+ if (!eof) {
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ memcpy(sfep, oldsfep, old_isize - nbytes);
+ }
+ kmem_free(buf);
+ dp->i_disk_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int /* pick result */
+xfs_dir2_sf_addname_pick(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* inode # size changes */
+ xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
+ xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int holefit; /* found hole it will fit in */
+ int i; /* entry number */
+ xfs_dir2_data_aoff_t offset; /* data block offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ int size; /* entry's data size */
+ int used; /* data bytes used */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ size = xfs_dir2_data_entsize(mp, args->namelen);
+ offset = args->geo->data_first_offset;
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ holefit = 0;
+ /*
+ * Loop over sf entries.
+ * Keep track of data offset and whether we've seen a place
+ * to insert the new entry.
+ */
+ for (i = 0; i < sfp->count; i++) {
+ if (!holefit)
+ holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+ /*
+ * Calculate data bytes used excluding the new entry, if this
+ * was a data block (block form directory).
+ */
+ used = offset +
+ (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t);
+ /*
+ * If it won't fit in a block form then we can't insert it,
+ * we'll go back, convert to block, then try the insert and convert
+ * to leaf.
+ */
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
+ return 0;
+ /*
+ * If changing the inode number size, do it the hard way.
+ */
+ if (objchange)
+ return 2;
+ /*
+ * If it won't fit at the end then do it the hard way (use the hole).
+ */
+ if (used + size > args->geo->blksize)
+ return 2;
+ /*
+ * Do it the easy way.
+ */
+ *sfepp = sfep;
+ *offsetp = offset;
+ return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry number */
+ int i8count; /* number of big inode#s */
+ xfs_ino_t ino; /* entry inode number */
+ int offset; /* data offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ offset = args->geo->data_first_offset;
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+ offset =
+ xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX);
+ }
+ ASSERT(i8count == sfp->i8count);
+ ASSERT((char *)sfep - (char *)sfp == dp->i_disk_size);
+ ASSERT(offset +
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif /* DEBUG */
+
+/* Verify the consistency of an inline directory. */
+xfs_failaddr_t
+xfs_dir2_sf_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_dir2_sf_hdr *sfp;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_entry *next_sfep;
+ char *endp;
+ xfs_ino_t ino;
+ int i;
+ int i8count;
+ int offset;
+ int64_t size;
+ int error;
+ uint8_t filetype;
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
+ size = ifp->if_bytes;
+
+ /*
+ * Give up if the directory is way too short.
+ */
+ if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) ||
+ size < xfs_dir2_sf_hdr_size(sfp->i8count))
+ return __this_address;
+
+ endp = (char *)sfp + size;
+
+ /* Check .. entry */
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+ error = xfs_dir_ino_validate(mp, ino);
+ if (error)
+ return __this_address;
+ offset = mp->m_dir_geo->data_first_offset;
+
+ /* Check all reported entries */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ /*
+ * struct xfs_dir2_sf_entry has a variable length.
+ * Check the fixed-offset parts of the structure are
+ * within the data buffer.
+ */
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
+ return __this_address;
+
+ /* Don't allow names with known bad length. */
+ if (sfep->namelen == 0)
+ return __this_address;
+
+ /*
+ * Check that the variable-length part of the structure is
+ * within the data buffer. The next entry starts after the
+ * name component, so nextentry is an acceptable test.
+ */
+ next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ if (endp < (char *)next_sfep)
+ return __this_address;
+
+ /* Check that the offsets always increase. */
+ if (xfs_dir2_sf_get_offset(sfep) < offset)
+ return __this_address;
+
+ /* Check the inode number. */
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+ error = xfs_dir_ino_validate(mp, ino);
+ if (error)
+ return __this_address;
+
+ /* Check the file type. */
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
+ if (filetype >= XFS_DIR3_FT_MAX)
+ return __this_address;
+
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+
+ sfep = next_sfep;
+ }
+ if (i8count != sfp->i8count)
+ return __this_address;
+ if ((void *)sfep != (void *)endp)
+ return __this_address;
+
+ /* Make sure this whole thing ought to be in local format. */
+ if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Create a new (shortform) directory.
+ */
+int /* error, always 0 */
+xfs_dir2_sf_create(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_ino_t pino) /* parent inode number */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i8count; /* parent inode is an 8-byte number */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ int size; /* directory size */
+
+ trace_xfs_dir2_sf_create(args);
+
+ dp = args->dp;
+
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_disk_size == 0);
+ /*
+ * If it's currently a zero-length extent file,
+ * convert it to local format.
+ */
+ if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) {
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ }
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_df.if_bytes == 0);
+ i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+ size = xfs_dir2_sf_hdr_size(i8count);
+ /*
+ * Make a buffer for the data.
+ */
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ /*
+ * Fill in the header,
+ */
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp->i8count = i8count;
+ /*
+ * Now can put in the inode number, since i8count is set.
+ */
+ xfs_dir2_sf_put_parent_ino(sfp, pino);
+ sfp->count = 0;
+ dp->i_disk_size = size;
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int /* error */
+xfs_dir2_sf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry index */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ enum xfs_dacmp cmp; /* comparison result */
+ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
+
+ trace_xfs_dir2_sf_lookup(args);
+
+ xfs_dir2_sf_check(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Special case for .
+ */
+ if (args->namelen == 1 && args->name[0] == '.') {
+ args->inumber = dp->i_ino;
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
+ return -EEXIST;
+ }
+ /*
+ * Special case for ..
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+ args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
+ return -EEXIST;
+ }
+ /*
+ * Loop over all the entries trying to match ours.
+ */
+ ci_sfep = NULL;
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ /*
+ * Compare name and if it's an exact match, return the inode
+ * number. If it's the first case-insensitive match, store the
+ * inode number and continue looking for an exact match.
+ */
+ cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ args->filetype = xfs_dir2_sf_get_ftype(mp, sfep);
+ if (cmp == XFS_CMP_EXACT)
+ return -EEXIST;
+ ci_sfep = sfep;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was not found, return -ENOENT.
+ */
+ if (!ci_sfep)
+ return -ENOENT;
+ /* otherwise process the CI match as required by the caller */
+ return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_removename(
+ xfs_da_args_t *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int byteoff; /* offset of removed entry */
+ int entsize; /* this entry's size */
+ int i; /* shortform entry index */
+ int newsize; /* new inode size */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_removename(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ oldsize = (int)dp->i_disk_size;
+ ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == oldsize);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ /*
+ * Loop over the old directory entries.
+ * Find the one we're deleting.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) ==
+ args->inumber);
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->count)
+ return -ENOENT;
+ /*
+ * Calculate sizes.
+ */
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen);
+ newsize = oldsize - entsize;
+ /*
+ * Copy the part if any after the removed entry, sliding it down.
+ */
+ if (byteoff + entsize < oldsize)
+ memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+ oldsize - (byteoff + entsize));
+ /*
+ * Fix up the header and file size.
+ */
+ sfp->count--;
+ dp->i_disk_size = newsize;
+ /*
+ * Reallocate, making it smaller.
+ */
+ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Are we changing inode number size?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ if (sfp->i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->i8count--;
+ }
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Check whether the sf dir replace operation need more blocks.
+ */
+static bool
+xfs_dir2_sf_replace_needblock(
+ struct xfs_inode *dp,
+ xfs_ino_t inum)
+{
+ int newsize;
+ struct xfs_dir2_sf_hdr *sfp;
+
+ if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
+ return false;
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
+
+ return inum > XFS_DIR2_MAX_SHORT_INUM &&
+ sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int i; /* entry index */
+ xfs_ino_t ino=0; /* entry old inode number */
+ int i8elevated; /* sf_toino8 set i8count=1 */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_replace(args);
+
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+ /*
+ * New inode number is large, and need to convert to 8-byte inodes.
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+ int error; /* error return value */
+
+ /*
+ * Won't fit as shortform, convert to block then do replace.
+ */
+ if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) {
+ error = xfs_dir2_sf_to_block(args);
+ if (error)
+ return error;
+ return xfs_dir2_block_replace(args);
+ }
+ /*
+ * Still fits, convert to 8-byte now.
+ */
+ xfs_dir2_sf_toino8(args);
+ i8elevated = 1;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ } else
+ i8elevated = 0;
+
+ ASSERT(args->namelen != 1 || args->name[0] != '.');
+ /*
+ * Replace ..'s entry.
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ASSERT(args->inumber != ino);
+ xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
+ }
+ /*
+ * Normal entry, look for the name.
+ */
+ else {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ ASSERT(args->inumber != ino);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->count) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (i8elevated)
+ xfs_dir2_sf_toino4(args);
+ return -ENOENT;
+ }
+ }
+ /*
+ * See if the old number was large, the new number is small.
+ */
+ if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * And the old count was one, so need to convert to small.
+ */
+ if (sfp->i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->i8count--;
+ }
+ /*
+ * See if the old number was small, the new number is large.
+ */
+ if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * add to the i8count unless we just converted to 8-byte
+ * inodes (which does an implied i8count = 1)
+ */
+ ASSERT(sfp->i8count != 0);
+ if (!i8elevated)
+ sfp->i8count++;
+ }
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ char *buf; /* old dir's buffer */
+ int i; /* entry index */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino4(args);
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 1);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size.
+ */
+ newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->count = oldsfp->count;
+ sfp->i8count = 0;
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_disk_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ char *buf; /* old dir's buffer */
+ int i; /* entry index */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino8(args);
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 0);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size (nb: entry count + 1 for parent)
+ */
+ newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->count = oldsfp->count;
+ sfp->i8count = 1;
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_disk_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000..15a362e2f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+
+int
+xfs_calc_dquots_per_chunk(
+ unsigned int nbblks) /* basic block units */
+{
+ ASSERT(nbblks > 0);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ *
+ * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure;
+ * we verify them separately because at some points we have only the
+ * smaller xfs_disk_dquot structure available.
+ */
+
+xfs_failaddr_t
+xfs_dquot_verify(
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq,
+ xfs_dqid_t id) /* used only during quotacheck */
+{
+ __u8 ddq_type;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't play the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC))
+ return __this_address;
+ if (ddq->d_version != XFS_DQUOT_VERSION)
+ return __this_address;
+
+ if (ddq->d_type & ~XFS_DQTYPE_ANY)
+ return __this_address;
+ ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
+ if (ddq_type != XFS_DQTYPE_USER &&
+ ddq_type != XFS_DQTYPE_PROJ &&
+ ddq_type != XFS_DQTYPE_GROUP)
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
+ return __this_address;
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id))
+ return __this_address;
+
+ if (!ddq->d_id)
+ return NULL;
+
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit) &&
+ !ddq->d_btimer)
+ return __this_address;
+
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit) &&
+ !ddq->d_itimer)
+ return __this_address;
+
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit) &&
+ !ddq->d_rtbtimer)
+ return __this_address;
+
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_dqblk_verify(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id) /* used only during quotacheck */
+{
+ if (xfs_has_crc(mp) &&
+ !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id);
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+void
+xfs_dqblk_repair(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
+{
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
+
+ dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ dqb->dd_diskdq.d_type = type;
+ dqb->dd_diskdq.d_id = cpu_to_be32(id);
+
+ if (xfs_has_crc(mp)) {
+ uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_has_crc(mp))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF)) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSBADCRC, __func__,
+ d, sizeof(*d), __this_address);
+ return false;
+ }
+ }
+ return true;
+}
+
+STATIC xfs_failaddr_t
+xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_dqblk *dqb = bp->b_addr;
+ xfs_failaddr_t fa;
+ xfs_dqid_t id = 0;
+ int ndquots;
+ int i;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = &dqb[i].dd_diskdq;
+
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i);
+ if (fa) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, &dqb[i],
+ sizeof(struct xfs_dqblk), fa);
+ return fa;
+ }
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dquot_buf_verify_struct(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ return xfs_dquot_buf_verify(mp, bp, false);
+}
+
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp, false))
+ return;
+ xfs_dquot_buf_verify(mp, bp, false);
+}
+
+/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
+ xfs_dquot_buf_verify(mp, bp, true) != NULL) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_dquot_buf_verify(mp, bp, false);
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+ .verify_struct = xfs_dquot_buf_verify_struct,
+};
+
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
new file mode 100644
index 000000000..536290816
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ERRORTAG_H_
+#define __XFS_ERRORTAG_H_
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR 0
+#define XFS_ERRTAG_IFLUSH_1 1
+#define XFS_ERRTAG_IFLUSH_2 2
+#define XFS_ERRTAG_IFLUSH_3 3
+#define XFS_ERRTAG_IFLUSH_4 4
+#define XFS_ERRTAG_IFLUSH_5 5
+#define XFS_ERRTAG_IFLUSH_6 6
+#define XFS_ERRTAG_DA_READ_BUF 7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
+#define XFS_ERRTAG_ALLOC_READ_AGF 10
+#define XFS_ERRTAG_IALLOC_READ_AGI 11
+#define XFS_ERRTAG_ITOBP_INOTOBP 12
+#define XFS_ERRTAG_IUNLINK 13
+#define XFS_ERRTAG_IUNLINK_REMOVE 14
+#define XFS_ERRTAG_DIR_INO_VALIDATE 15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
+#define XFS_ERRTAG_IODONE_IOERR 17
+#define XFS_ERRTAG_STRATREAD_IOERR 18
+#define XFS_ERRTAG_STRATCMPL_IOERR 19
+#define XFS_ERRTAG_DIOWRITE_IOERR 20
+#define XFS_ERRTAG_BMAPIFORMAT 21
+#define XFS_ERRTAG_FREE_EXTENT 22
+#define XFS_ERRTAG_RMAP_FINISH_ONE 23
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+#define XFS_ERRTAG_BMAP_FINISH_ONE 26
+#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+/*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are silenty dropped and handled as if they failed.
+ * All delalloc blocks in the range of the write (including pre-existing
+ * delalloc blocks!) are tossed as part of the write failure error
+ * handling sequence.
+ */
+#define XFS_ERRTAG_DROP_WRITES 28
+#define XFS_ERRTAG_LOG_BAD_CRC 29
+#define XFS_ERRTAG_LOG_ITEM_PIN 30
+#define XFS_ERRTAG_BUF_LRU_REF 31
+#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
+#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_BUF_IOERROR 35
+#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
+#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
+#define XFS_ERRTAG_AG_RESV_FAIL 38
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT 100
+#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT 1
+#define XFS_RANDOM_RMAP_FINISH_ONE 1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+#define XFS_RANDOM_BMAP_FINISH_ONE 1
+#define XFS_RANDOM_AG_RESV_CRITICAL 4
+#define XFS_RANDOM_DROP_WRITES 1
+#define XFS_RANDOM_LOG_BAD_CRC 1
+#define XFS_RANDOM_LOG_ITEM_PIN 1
+#define XFS_RANDOM_BUF_LRU_REF 2
+#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
+#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
+#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
+#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
+
+#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 000000000..371dc0723
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,1829 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, which log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
+#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
+#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
+#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
+#define XFS_SB_VERSION_NUMBITS 0x000f
+#define XFS_SB_VERSION_ALLFBITS 0xfff0
+#define XFS_SB_VERSION_ATTRBIT 0x0010
+#define XFS_SB_VERSION_NLINKBIT 0x0020
+#define XFS_SB_VERSION_QUOTABIT 0x0040
+#define XFS_SB_VERSION_ALIGNBIT 0x0080
+#define XFS_SB_VERSION_DALIGNBIT 0x0100
+#define XFS_SB_VERSION_SHAREDBIT 0x0200
+#define XFS_SB_VERSION_LOGV2BIT 0x0400
+#define XFS_SB_VERSION_SECTORBIT 0x0800
+#define XFS_SB_VERSION_EXTFLGBIT 0x1000
+#define XFS_SB_VERSION_DIRV2BIT 0x2000
+#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
+#define XFS_SB_VERSION_MOREBITSBIT 0x8000
+
+/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2. Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
+
+#define XFS_SB_VERSION2_OKBITS \
+ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_FTYPE)
+
+/* Maximum size of the xfs filesystem label, no terminating NULL */
+#define XFSLABEL_MAX 12
+
+/*
+ * Superblock - in core version. Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+ uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ uint32_t sb_blocksize; /* logical block size, bytes */
+ xfs_rfsblock_t sb_dblocks; /* number of data blocks */
+ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
+ xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* user-visible file system unique id */
+ xfs_fsblock_t sb_logstart; /* starting block of log if internal */
+ xfs_ino_t sb_rootino; /* root inode number */
+ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
+ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
+ xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
+ xfs_agblock_t sb_agblocks; /* size of an allocation group */
+ xfs_agnumber_t sb_agcount; /* number of allocation groups */
+ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
+ xfs_extlen_t sb_logblocks; /* number of log blocks */
+ uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
+ uint16_t sb_sectsize; /* volume sector size, bytes */
+ uint16_t sb_inodesize; /* inode size, bytes */
+ uint16_t sb_inopblock; /* inodes per block */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
+ uint8_t sb_blocklog; /* log2 of sb_blocksize */
+ uint8_t sb_sectlog; /* log2 of sb_sectsize */
+ uint8_t sb_inodelog; /* log2 of sb_inodesize */
+ uint8_t sb_inopblog; /* log2 of sb_inopblock */
+ uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ uint8_t sb_rextslog; /* log2 of sb_rextents */
+ uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
+ uint8_t sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ uint64_t sb_icount; /* allocated inodes */
+ uint64_t sb_ifree; /* free inodes */
+ uint64_t sb_fdblocks; /* free data blocks */
+ uint64_t sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ xfs_ino_t sb_uquotino; /* user quota inode */
+ xfs_ino_t sb_gquotino; /* group quota inode */
+ uint16_t sb_qflags; /* quota flags */
+ uint8_t sb_flags; /* misc. flags */
+ uint8_t sb_shared_vn; /* shared version number */
+ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ uint32_t sb_unit; /* stripe or raid unit */
+ uint32_t sb_width; /* stripe or raid width */
+ uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
+ uint8_t sb_logsectlog; /* log2 of the log sector size */
+ uint16_t sb_logsectsize; /* sector size for the log, bytes */
+ uint32_t sb_logsunit; /* stripe unit size for the log */
+ uint32_t sb_features2; /* additional feature bits */
+
+ /*
+ * bad features2 field as a result of failing to pad the sb structure to
+ * 64 bits. Some machines will be using this field for features2 bits.
+ * Easiest just to mark it bad and not use it for anything else.
+ *
+ * This is not kept up to date in memory; it is always overwritten by
+ * the value in sb_features2 when formatting the incore superblock to
+ * the disk buffer.
+ */
+ uint32_t sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ uint32_t sb_features_compat;
+ uint32_t sb_features_ro_compat;
+ uint32_t sb_features_incompat;
+ uint32_t sb_features_log_incompat;
+
+ uint32_t sb_crc; /* superblock crc */
+ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
+
+ xfs_ino_t sb_pquotino; /* project quota inode */
+ xfs_lsn_t sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
+
+/*
+ * Superblock - on disk version. Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+struct xfs_dsb {
+ __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __be32 sb_blocksize; /* logical block size, bytes */
+ __be64 sb_dblocks; /* number of data blocks */
+ __be64 sb_rblocks; /* number of realtime blocks */
+ __be64 sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* user-visible file system unique id */
+ __be64 sb_logstart; /* starting block of log if internal */
+ __be64 sb_rootino; /* root inode number */
+ __be64 sb_rbmino; /* bitmap inode for realtime extents */
+ __be64 sb_rsumino; /* summary inode for rt bitmap */
+ __be32 sb_rextsize; /* realtime extent size, blocks */
+ __be32 sb_agblocks; /* size of an allocation group */
+ __be32 sb_agcount; /* number of allocation groups */
+ __be32 sb_rbmblocks; /* number of rt bitmap blocks */
+ __be32 sb_logblocks; /* number of log blocks */
+ __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
+ __be16 sb_sectsize; /* volume sector size, bytes */
+ __be16 sb_inodesize; /* inode size, bytes */
+ __be16 sb_inopblock; /* inodes per block */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
+ __u8 sb_blocklog; /* log2 of sb_blocksize */
+ __u8 sb_sectlog; /* log2 of sb_sectsize */
+ __u8 sb_inodelog; /* log2 of sb_inodesize */
+ __u8 sb_inopblog; /* log2 of sb_inopblock */
+ __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __u8 sb_rextslog; /* log2 of sb_rextents */
+ __u8 sb_inprogress; /* mkfs is in progress, don't mount */
+ __u8 sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __be64 sb_icount; /* allocated inodes */
+ __be64 sb_ifree; /* free inodes */
+ __be64 sb_fdblocks; /* free data blocks */
+ __be64 sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ __be64 sb_uquotino; /* user quota inode */
+ __be64 sb_gquotino; /* group quota inode */
+ __be16 sb_qflags; /* quota flags */
+ __u8 sb_flags; /* misc. flags */
+ __u8 sb_shared_vn; /* shared version number */
+ __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __be32 sb_unit; /* stripe or raid unit */
+ __be32 sb_width; /* stripe or raid width */
+ __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __u8 sb_logsectlog; /* log2 of the log sector size */
+ __be16 sb_logsectsize; /* sector size for the log, bytes */
+ __be32 sb_logsunit; /* stripe unit size for the log */
+ __be32 sb_features2; /* additional feature bits */
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+
+ __le32 sb_crc; /* superblock crc */
+ __be32 sb_spino_align; /* sparse inode chunk alignment */
+
+ __be64 sb_pquotino; /* project quota inode */
+ __be64 sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
+};
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
+#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN 0
+
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+ return sbp->sb_bad_features2 != sbp->sb_features2;
+}
+
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+}
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
+#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+ (XFS_SB_FEAT_INCOMPAT_FTYPE| \
+ XFS_SB_FEAT_INCOMPAT_SPINODES| \
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME| \
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_incompat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+ struct xfs_sb *sbp,
+ uint32_t feature)
+{
+ return (sbp->sb_features_log_incompat & feature) != 0;
+}
+
+static inline void
+xfs_sb_remove_incompat_log_features(
+ struct xfs_sb *sbp)
+{
+ sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL;
+}
+
+static inline void
+xfs_sb_add_incompat_log_features(
+ struct xfs_sb *sbp,
+ unsigned int features)
+{
+ sbp->sb_features_log_incompat |= features;
+}
+
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino ||
+ ino == sbp->sb_gquotino ||
+ ino == sbp->sb_pquotino);
+}
+
+#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+
+#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
+ xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
+ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb) \
+ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b) \
+ ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+
+/*
+ * Allocation group header
+ *
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
+#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+
+#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number. Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
+ __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
+ __be32 agf_seqno; /* sequence # starting from 0 */
+ __be32 agf_length; /* size in blocks of a.g. */
+ /*
+ * Freespace and rmap information
+ */
+ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
+ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+
+ __be32 agf_flfirst; /* first freelist block's index */
+ __be32 agf_fllast; /* last freelist block's index */
+ __be32 agf_flcount; /* count of blocks in freelist */
+ __be32 agf_freeblks; /* total free blocks */
+
+ __be32 agf_longest; /* longest free space */
+ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ __be32 agf_rmap_blocks; /* rmapbt blocks used */
+ __be32 agf_refcount_blocks; /* refcountbt blocks used */
+
+ __be32 agf_refcount_root; /* refcount tree root block */
+ __be32 agf_refcount_level; /* refcount btree levels */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[14];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
+#define XFS_AGF_NUM_BITS 18
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
+ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
+ { XFS_AGF_SEQNO, "SEQNO" }, \
+ { XFS_AGF_LENGTH, "LENGTH" }, \
+ { XFS_AGF_ROOTS, "ROOTS" }, \
+ { XFS_AGF_LEVELS, "LEVELS" }, \
+ { XFS_AGF_FLFIRST, "FLFIRST" }, \
+ { XFS_AGF_FLLAST, "FLLAST" }, \
+ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
+ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
+ { XFS_AGF_LONGEST, "LONGEST" }, \
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }, \
+ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \
+ { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \
+ { XFS_AGF_SPARE64, "SPARE64" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS 64
+
+typedef struct xfs_agi {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
+ __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
+ __be32 agi_seqno; /* sequence # starting from 0 */
+ __be32 agi_length; /* size in blocks of a.g. */
+ /*
+ * Inode information
+ * Inodes are mapped by interpreting the inode number, so no
+ * mapping data is needed here.
+ */
+ __be32 agi_count; /* count of allocated inodes */
+ __be32 agi_root; /* root of inode btree */
+ __be32 agi_level; /* levels in inode btree */
+ __be32 agi_freecount; /* number of free inodes */
+
+ __be32 agi_newino; /* new inode just allocated */
+ __be32 agi_dirino; /* last directory inode chunk */
+ /*
+ * Hash table of inodes which have been unlinked but are
+ * still being referenced.
+ */
+ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr))
+
+struct xfs_agfl {
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+} __attribute__((packed));
+
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define XFS_AG_CHECK_DADDR(mp,d,len) \
+ ((len) == 1 ? \
+ ASSERT((d) == XFS_SB_DADDR || \
+ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+ ASSERT(xfs_daddr_to_agno(mp, d) == \
+ xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
+ __be32 t_sec; /* timestamp seconds */
+ __be32 t_nsec; /* timestamp nanoseconds */
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat. To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct xfs_log_dinode which matches the layout of
+ * this structure, but is kept in native format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+struct xfs_dinode {
+ __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __be16 di_mode; /* mode and type of file */
+ __u8 di_version; /* inode version */
+ __u8 di_format; /* format of di_c data */
+ __be16 di_onlink; /* old number of links to file */
+ __be32 di_uid; /* owner's user id */
+ __be32 di_gid; /* owner's group id */
+ __be32 di_nlink; /* number of links to file */
+ __be16 di_projid_lo; /* lower part of owner's project id */
+ __be16 di_projid_hi; /* higher part owner's project id */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
+ xfs_timestamp_t di_atime; /* time last accessed */
+ xfs_timestamp_t di_mtime; /* time last modified */
+ xfs_timestamp_t di_ctime; /* time created/inode modified */
+ __be64 di_size; /* number of bytes in file */
+ __be64 di_nblocks; /* # of direct & btree blocks used */
+ __be32 di_extsize; /* basic/minimum extent size for file */
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
+ __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __s8 di_aformat; /* format of attr fork's data */
+ __be32 di_dmevmask; /* DMIG event mask */
+ __be16 di_dmstate; /* DMIG state info */
+ __be16 di_flags; /* random flags, XFS_DIFLAG_... */
+ __be32 di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __be32 di_cowextsize; /* basic cow extent size for file */
+ __u8 di_pad2[12]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+};
+
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ */
+#define XFS_MAXLINK ((1U << 31) - 1U)
+
+/*
+ * Values for di_format
+ *
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
+enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* added long ago, but never used */
+};
+
+#define XFS_INODE_FORMAT_STR \
+ { XFS_DINODE_FMT_DEV, "dev" }, \
+ { XFS_DINODE_FMT_LOCAL, "local" }, \
+ { XFS_DINODE_FMT_EXTENTS, "extent" }, \
+ { XFS_DINODE_FMT_BTREE, "btree" }, \
+ { XFS_DINODE_FMT_UUID, "uuid" }
+
+/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach upto,
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG 8
+#define XFS_DINODE_MAX_LOG 11
+#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_DINODE_SIZE(mp) \
+ (xfs_has_v3inodes(mp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+ ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
+#define XFS_DFORK_ASIZE(dip,mp) \
+ ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_DFORK_DSIZE(dip, mp) : \
+ XFS_DFORK_ASIZE(dip, mp))
+
+#define XFS_DFORK_MAXEXT(dip, mp, w) \
+ (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec))
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip) \
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w) \
+ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (dip)->di_format : \
+ (dip)->di_aformat)
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+/* Do not use bit 15, di_flags is legacy and unchanging now */
+
+#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#define XFS_DIFLAG_ANY \
+ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+/*
+ * Values for di_flags2 These start by being exposed to userspace in the upper
+ * 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+
+#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+
+#define XFS_DIFLAG2_ANY \
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
+
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1)
+#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
+#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
+#define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log)
+#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
+#define XFS_INO_BITS(mp) \
+ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define XFS_INO_TO_AGNO(mp,i) \
+ ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGINO(mp,i) \
+ ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGBNO(mp,i) \
+ (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+ XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define XFS_INO_TO_OFFSET(mp,i) \
+ ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_INO_TO_FSB(mp,i) \
+ XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define XFS_AGINO_TO_INO(mp,a,i) \
+ (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define XFS_AGINO_TO_OFFSET(mp,i) \
+ ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
+ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+
+#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
+
+#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s) \
+ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so) \
+ ((xfs_suminfo_t *)((bp)->b_addr + \
+ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi) \
+ ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w) xfs_lowbit32(w)
+#define XFS_RTHIBIT(w) xfs_highbit32(w)
+
+#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
+#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
+
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
+
+/* bitmask to determine if this is a user/group/project dquot */
+#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
+ XFS_DQTYPE_PROJ | \
+ XFS_DQTYPE_GROUP)
+
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
+/*
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * The length of quota grace periods are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
+ */
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum the bigtime inode timestamp,
+ * so we cap the maximum bigtime quota expiration to the max inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
+/*
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+
+/*
+ * This is the main portion of the on-disk representation of quota information
+ * for a user. We pad this with some more expansion room to construct the on
+ * disk structure.
+ */
+struct xfs_disk_dquot {
+ __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
+ __u8 d_version; /* dquot version */
+ __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */
+ __be32 d_id; /* user,project,group id */
+ __be64 d_blk_hardlimit;/* absolute limit on disk blks */
+ __be64 d_blk_softlimit;/* preferred limit on disk blks */
+ __be64 d_ino_hardlimit;/* maximum # allocated inodes */
+ __be64 d_ino_softlimit;/* preferred inode limit */
+ __be64 d_bcount; /* disk blocks owned by the user */
+ __be64 d_icount; /* inodes owned by the user */
+ __be32 d_itimer; /* zero if within inode limits if not,
+ this is when we refuse service */
+ __be32 d_btimer; /* similar to above; for disk blocks */
+ __be16 d_iwarns; /* warnings issued wrt num inodes */
+ __be16 d_bwarns; /* warnings issued wrt disk blocks */
+ __be32 d_pad0; /* 64 bit align */
+ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
+ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
+ __be64 d_rtbcount; /* realtime blocks owned */
+ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
+ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
+ __be16 d_pad;
+};
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+struct xfs_dqblk {
+ struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
+ char dd_fill[4];/* filling for posterity */
+
+ /*
+ * These two are only present on filesystems with the CRC bits set.
+ */
+ __be32 dd_crc; /* checksum */
+ __be64 dd_lsn; /* last modification in log */
+ uuid_t dd_uuid; /* location information */
+};
+
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * This defines the unit of allocation of dquots.
+ *
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ *
+ * However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ *
+ * This is part of the ondisk format because the structure size is not a power
+ * of two, which leaves slack at the end of the disk block.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
+
+struct xfs_dsymlink_hdr {
+ __be32 sl_magic;
+ __be32 sl_offset;
+ __be32 sl_bytes;
+ __be32 sl_crc;
+ uuid_t sl_uuid;
+ __be64 sl_owner;
+ __be64 sl_blkno;
+ __be64 sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+#define XFS_SYMLINK_MAXLEN 1024
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
+ sizeof(struct xfs_dsymlink_hdr) : 0))
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
+
+typedef uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT \
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t)))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes wth the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ union {
+ struct {
+ __be32 ir_freecount; /* count of free inodes */
+ } f;
+ struct {
+ __be16 ir_holemask;/* hole mask for sparse chunks */
+ __u8 ir_count; /* total inode count */
+ __u8 ir_freecount; /* count of free inodes */
+ } sp;
+ } ir_u;
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ uint16_t ir_holemask; /* hole mask for sparse chunks */
+ uint8_t ir_count; /* total inode count */
+ uint8_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+ /* non-zero holemask represents a sparse rec. */
+ return holemask;
+}
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * Reverse mapping btree format definitions
+ *
+ * There is a btree for the reverse map per allocation group
+ */
+#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */
+
+/*
+ * Ownership info for an extent. This is used to create reverse-mapping
+ * entries.
+ */
+#define XFS_OWNER_INFO_ATTR_FORK (1 << 0)
+#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1)
+struct xfs_owner_info {
+ uint64_t oi_owner;
+ xfs_fileoff_t oi_offset;
+ unsigned int oi_flags;
+};
+
+/*
+ * Special owner types.
+ *
+ * Seeing as we only support up to 8EB, we have the upper bit of the owner field
+ * to tell us we have a special owner value. We use these for static metadata
+ * allocated at mkfs/growfs time, as well as for freespace management metadata.
+ */
+#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */
+#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */
+#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
+#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
+#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
+#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */
+
+#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
+
+/*
+ * Data record structure
+ */
+struct xfs_rmap_rec {
+ __be32 rm_startblock; /* extent start block */
+ __be32 rm_blockcount; /* extent length */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+};
+
+/*
+ * rmap btree record
+ * rm_offset:63 is the attribute fork flag
+ * rm_offset:62 is the bmbt block flag
+ * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt)
+ * rm_offset:54-60 aren't used and should be zero
+ * rm_offset:0-53 is the block offset within the inode
+ */
+#define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61)
+
+#define XFS_RMAP_LEN_MAX ((uint32_t)~0U)
+#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \
+ XFS_RMAP_OFF_BMBT_BLOCK | \
+ XFS_RMAP_OFF_UNWRITTEN)
+#define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL)
+
+#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK)
+
+#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK))
+#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK))
+#define XFS_RMAP_IS_UNWRITTEN(len) (!!((off) & XFS_RMAP_OFF_UNWRITTEN))
+
+#define RMAPBT_STARTBLOCK_BITLEN 32
+#define RMAPBT_BLOCKCOUNT_BITLEN 32
+#define RMAPBT_OWNER_BITLEN 64
+#define RMAPBT_ATTRFLAG_BITLEN 1
+#define RMAPBT_BMBTFLAG_BITLEN 1
+#define RMAPBT_EXNTFLAG_BITLEN 1
+#define RMAPBT_UNUSED_OFFSET_BITLEN 7
+#define RMAPBT_OFFSET_BITLEN 54
+
+/*
+ * Key structure
+ *
+ * We don't use the length for lookups
+ */
+struct xfs_rmap_key {
+ __be32 rm_startblock; /* extent start block */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+} __attribute__((packed));
+
+/* btree pointer type */
+typedef __be32 xfs_rmap_ptr_t;
+
+#define XFS_RMAP_BLOCK(mp) \
+ (xfs_has_finobt(((mp))) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
+
+/*
+ * Reference Count Btree format definitions
+ *
+ */
+#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount). Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1. All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here. Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock. This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COWFLAG (1U << 31)
+#define REFCNTBT_COWFLAG_BITLEN 1
+#define REFCNTBT_AGBLOCK_BITLEN 31
+
+struct xfs_refcount_rec {
+ __be32 rc_startblock; /* starting block number */
+ __be32 rc_blockcount; /* count of blocks */
+ __be32 rc_refcount; /* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+ __be32 rc_startblock; /* starting block number */
+};
+
+#define MAXREFCOUNT ((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
+#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
+/*
+ * bmbt records have a file offset (block) field that is 54 bits wide, so this
+ * is the largest xfs_fileoff_t that we ever expect to see.
+ */
+#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+/* short form block header */
+struct xfs_btree_block_shdr {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+};
+
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct xfs_btree_block_shdr s;
+ struct xfs_btree_block_lhdr l;
+ } bb_u; /* rest */
+};
+
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_lhdr, bb_blkno))
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_lhdr))
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
+/*
+ * On-disk XFS access control list structure.
+ */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
+struct xfs_acl {
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_has_crc(mp) \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_SIZE(cnt) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
new file mode 100644
index 000000000..1cfd5bc65
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -0,0 +1,851 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ * Copyright (c) 1995-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FS_H__
+#define __XFS_FS_H__
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the min xfer size, xfer size multiple and file seek offset
+ * alignment.
+ */
+#ifndef HAVE_DIOATTR
+struct dioattr {
+ __u32 d_mem; /* data buffer memory alignment */
+ __u32 d_miniosz; /* min xfer size */
+ __u32 d_maxiosz; /* max xfer size */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back. The first structure is
+ * updated on return to give the offset and length for the next call.
+ */
+#ifndef HAVE_GETBMAP
+struct getbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output) */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries
+ * are used exactly as in the getbmap structure. The getbmapx structure
+ * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ * is only used for the first structure. It contains input flags
+ * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled
+ * in by the XFS_IOC_GETBMAPX command for each returned structure after
+ * the first.
+ */
+#ifndef HAVE_GETBMAPX
+struct getbmapx {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output). */
+ __s32 bmv_iflags; /* input flags (1st structure) */
+ __s32 bmv_oflags; /* output flags (after 1st structure)*/
+ __s32 bmv_unused1; /* future use */
+ __s32 bmv_unused2; /* future use */
+};
+#endif
+
+/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
+#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ 0x2 /* Deprecated */
+#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */
+#define BMV_IF_VALID \
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
+
+/* bmv_oflags values - returned for each non-header segment */
+#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
+#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
+#define BMV_OF_LAST 0x4 /* segment is the last in the file */
+#define BMV_OF_SHARED 0x8 /* segment shared with another file */
+
+/* fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+} xfs_flock64_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+struct xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V4
+ */
+struct xfs_fsop_geom_v4 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+struct xfs_fsop_geom {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+ uint32_t sick; /* o: unhealthy fs & rt metadata */
+ uint32_t checked; /* o: checked fs & rt metadata */
+ __u64 reserved[17]; /* reserved space */
+};
+
+#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
+#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */
+#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */
+#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
+#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
+#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+ __u64 freedata; /* free data section blocks */
+ __u64 freertx; /* free rt extents */
+ __u64 freeino; /* free inodes */
+ __u64 allocino; /* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+ __u64 resblks;
+ __u64 resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION 0
+#define XFS_FSOP_GEOM_VERSION_V5 5
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */
+#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */
+ /* -- Do not use -- (1 << 13) SGI parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
+#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
+
+/*
+ * Minimum and maximum sizes need for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
+ */
+#define XFS_MIN_AG_BLOCKS 64
+#define XFS_MIN_LOG_BLOCKS 512ULL
+#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL)
+
+/*
+ * Limits on sb_agblocks/sb_agblklog -- mkfs won't format AGs smaller than
+ * 16MB or larger than 1TB.
+ */
+#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
+#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+
+/* Used for sanity checks on superblock */
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \
+ (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
+
+/*
+ * Output for XFS_IOC_AG_GEOMETRY
+ */
+struct xfs_ag_geometry {
+ uint32_t ag_number; /* i/o: AG number */
+ uint32_t ag_length; /* o: length in blocks */
+ uint32_t ag_freeblks; /* o: free space */
+ uint32_t ag_icount; /* o: inodes allocated */
+ uint32_t ag_ifree; /* o: inodes free */
+ uint32_t ag_sick; /* o: sick things in ag */
+ uint32_t ag_checked; /* o: checked metadata in ag */
+ uint32_t ag_flags; /* i/o: flags for this ag */
+ uint64_t ag_reserved[12];/* o: zero */
+};
+#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
+#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */
+#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */
+#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */
+#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */
+#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */
+#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */
+#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
+#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+ __u32 newblocks; /* new log size, fsblocks */
+ __u32 isint; /* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+ __kernel_long_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} xfs_bstime_t;
+
+struct xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ xfs_bstime_t bs_atime; /* access time */
+ xfs_bstime_t bs_mtime; /* modify time */
+ xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ __u16 bs_projid_hi; /* higher part of project id */
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ unsigned char bs_pad[2]; /* pad space, unused */
+ __u32 bs_cowextsize; /* cow extent size */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+};
+
+/* New bulkstat structure that reports v5 features and fixes padding issues */
+struct xfs_bulkstat {
+ uint64_t bs_ino; /* inode number */
+ uint64_t bs_size; /* file size */
+
+ uint64_t bs_blocks; /* number of blocks */
+ uint64_t bs_xflags; /* extended flags */
+
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
+
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
+
+ uint32_t bs_gen; /* generation count */
+ uint32_t bs_uid; /* user id */
+ uint32_t bs_gid; /* group id */
+ uint32_t bs_projectid; /* project id */
+
+ uint32_t bs_atime_nsec; /* access time, nanoseconds */
+ uint32_t bs_mtime_nsec; /* modify time, nanoseconds */
+ uint32_t bs_ctime_nsec; /* inode change time, nanoseconds */
+ uint32_t bs_btime_nsec; /* creation time, nanoseconds */
+
+ uint32_t bs_blksize; /* block size */
+ uint32_t bs_rdev; /* device value */
+ uint32_t bs_cowextsize_blks; /* cow extent size hint, blocks */
+ uint32_t bs_extsize_blks; /* extent size hint, blocks */
+
+ uint32_t bs_nlink; /* number of links */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
+ uint32_t bs_aextents; /* attribute number of extents */
+ uint16_t bs_version; /* structure version */
+ uint16_t bs_forkoff; /* inode fork offset in bytes */
+
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ uint16_t bs_mode; /* type and mode */
+ uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
+
+ uint64_t bs_pad[6]; /* zeroed */
+};
+
+#define XFS_BULKSTAT_VERSION_V1 (1)
+#define XFS_BULKSTAT_VERSION_V5 (5)
+
+/* bs_sick flags */
+#define XFS_BS_SICK_INODE (1 << 0) /* inode core */
+#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */
+#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */
+#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */
+#define XFS_BS_SICK_DIR (1 << 4) /* directory */
+#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
+#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
+
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline uint32_t
+bstat_get_projid(const struct xfs_bstat *bs)
+{
+ return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+struct xfs_fsop_bulkreq {
+ __u64 __user *lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ void __user *ubuffer;/* user buffer for inode desc. */
+ __s32 __user *ocount; /* output count pointer */
+};
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+struct xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+};
+
+/* New inumbers structure that reports v5 features and fixes padding issues */
+struct xfs_inumbers {
+ uint64_t xi_startino; /* starting inode number */
+ uint64_t xi_allocmask; /* mask of allocated inodes */
+ uint8_t xi_alloccount; /* # bits set in allocmask */
+ uint8_t xi_version; /* version */
+ uint8_t xi_padding[6]; /* zero */
+};
+
+#define XFS_INUMBERS_VERSION_V1 (1)
+#define XFS_INUMBERS_VERSION_V5 (5)
+
+/* Header for bulk inode requests. */
+struct xfs_bulk_ireq {
+ uint64_t ino; /* I/O: start with this inode */
+ uint32_t flags; /* I/O: operation flags */
+ uint32_t icount; /* I: count of entries in buffer */
+ uint32_t ocount; /* O: count of entries filled out */
+ uint32_t agno; /* I: see comment for IREQ_AGNO */
+ uint64_t reserved[5]; /* must be zero */
+};
+
+/*
+ * Only return results from the specified @agno. If @ino is zero, start
+ * with the first inode of @agno.
+ */
+#define XFS_BULK_IREQ_AGNO (1U << 0)
+
+/*
+ * Return bulkstat information for a single inode, where @ino value is a
+ * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
+ * values below. Not compatible with XFS_BULK_IREQ_AGNO.
+ */
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
+
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_OLD.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
+
+/* Operate on the root directory inode. */
+#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
+
+/*
+ * ioctl structures for v5 bulkstat and inumbers requests
+ */
+struct xfs_bulkstat_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_bulkstat bulkstat[];
+};
+#define XFS_BULKSTAT_REQ_SIZE(nr) (sizeof(struct xfs_bulkstat_req) + \
+ (nr) * sizeof(struct xfs_bulkstat))
+
+struct xfs_inumbers_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_inumbers inumbers[];
+};
+#define XFS_INUMBERS_REQ_SIZE(nr) (sizeof(struct xfs_inumbers_req) + \
+ (nr) * sizeof(struct xfs_inumbers))
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+ __s32 fd;
+ __s32 errtag;
+} xfs_error_injection_t;
+
+
+/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION 1
+struct xfs_fs_eofblocks {
+ __u32 eof_version;
+ __u32 eof_flags;
+ uid_t eof_uid;
+ gid_t eof_gid;
+ prid_t eof_prid;
+ __u32 pad32;
+ __u64 eof_min_file_size;
+ __u64 pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm;
+ * kernel only, not included in
+ * valid mask */
+#define XFS_EOF_FLAGS_VALID \
+ (XFS_EOF_FLAGS_SYNC | \
+ XFS_EOF_FLAGS_UID | \
+ XFS_EOF_FLAGS_GID | \
+ XFS_EOF_FLAGS_PRID | \
+ XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ void __user *path; /* user pathname */
+ __u32 oflags; /* open flags */
+ void __user *ihandle;/* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ void __user *ohandle;/* user buffer for handle */
+ __u32 __user *ohandlen;/* user buffer length */
+} xfs_fsop_handlereq_t;
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+/*
+ * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
+ *
+ * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix.
+ */
+#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */
+#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */
+#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */
+
+typedef struct xfs_attrlist_cursor {
+ __u32 opaque[4];
+} xfs_attrlist_cursor_t;
+
+/*
+ * Define how lists of attribute names are returned to userspace from the
+ * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and a each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
+
+typedef struct xfs_fsop_attrlist_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ void __user *buffer; /* returned names */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+ __u32 am_opcode;
+#define ATTR_OP_GET 1 /* return the indicated attr's value */
+#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
+ __s32 am_error;
+ void __user *am_attrname;
+ void __user *am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags; /* XFS_IOC_ATTR_* */
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ struct xfs_attr_multiop __user *ops; /* attr_multi data */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * per machine unique filesystem identifier types.
+ */
+typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+
+typedef struct xfs_fid {
+ __u16 fid_len; /* length of remainder */
+ __u16 fid_pad;
+ __u32 fid_gen; /* generation number */
+ __u64 fid_ino; /* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_handle {
+ union {
+ __s64 align; /* force alignment of ha_fid */
+ xfs_fsid_t _ha_fsid; /* unique file system identifier */
+ } ha_u;
+ xfs_fid_t ha_fid; /* file system specific file ID */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+ int64_t sx_version; /* version */
+#define XFS_SX_VERSION 0
+ int64_t sx_fdtarget; /* fd of target file */
+ int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* leng from offset */
+ char sx_pad[16]; /* pad space, unused */
+ struct xfs_bstat sx_stat; /* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
+ * Flags for going down operation
+ */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+/* metadata scrubbing */
+struct xfs_scrub_metadata {
+ __u32 sm_type; /* What to check? */
+ __u32 sm_flags; /* flags; see below. */
+ __u64 sm_ino; /* inode number. */
+ __u32 sm_gen; /* inode generation. */
+ __u32 sm_agno; /* ag number. */
+ __u64 sm_reserved[5]; /* pad to 64 bytes */
+};
+
+/*
+ * Metadata types and flags for scrub operation.
+ */
+
+/* Scrub subcommands. */
+#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */
+#define XFS_SCRUB_TYPE_SB 1 /* superblock */
+#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */
+#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */
+#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */
+#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */
+#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */
+#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */
+#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */
+#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */
+#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */
+#define XFS_SCRUB_TYPE_INODE 11 /* inode record */
+#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */
+#define XFS_SCRUB_TYPE_DIR 15 /* directory */
+#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */
+#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */
+#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */
+#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */
+#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */
+#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
+#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
+#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
+
+/* Number of scrub subcommands. */
+#define XFS_SCRUB_TYPE_NR 25
+
+/* i: Repair this metadata. */
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
+
+/* o: Metadata object needs repair. */
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
+
+/*
+ * o: Metadata object could be optimized. It's not corrupt, but
+ * we could improve on it somehow.
+ */
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
+
+/* o: Cross-referencing failed. */
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
+
+/* o: Metadata object disagrees with cross-referenced metadata. */
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
+
+/* o: Scan was not complete. */
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
+
+/* o: Metadata object looked funny but isn't corrupt. */
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
+
+/*
+ * o: IFLAG_REPAIR was set but metadata object did not need fixing or
+ * optimization and has therefore not been altered.
+ */
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING | \
+ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
+#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
+
+/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
+ * ioctl commands that are used by Linux filesystems
+ */
+#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
+#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposed more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+/* XFS_IOC_ALLOCSP ------- deprecated 10 */
+/* XFS_IOC_FREESP -------- deprecated 11 */
+#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */
+/* XFS_IOC_FREESP64 ------ deprecated 37 */
+#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
+/* XFS_IOC_FSSETDM ------- deprecated 39 */
+#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr)
+/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
+/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
+#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
+/* XFS_IOC_GETFSMAP ------ hoisted 59 */
+#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
+/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
+
+#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
+#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
+
+/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */
+#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
+#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t)
+#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
+#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
+#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+
+
+#ifndef HAVE_BBMACROS
+/*
+ * Block I/O parameterization. A basic block (BB) is the lowest size of
+ * filesystem allocation, and must equal 512. Length units given to bio
+ * routines are in BB's.
+ */
+#define BBSHIFT 9
+#define BBSIZE (1<<BBSHIFT)
+#define BBMASK (BBSIZE-1)
+#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs) ((bbs) << BBSHIFT)
+#endif
+
+#endif /* __XFS_FS_H__ */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 000000000..99e796256
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs. Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful. If
+ * problems remain at unmount time, we can also request manual intervention by
+ * logging a notice to run xfs_repair.
+ *
+ * Each health tracking group uses a pair of fields for reporting. The
+ * "checked" field tell us if a given piece of metadata has ever been examined,
+ * and the "sick" field tells us if that piece was found to need repairs.
+ * Therefore we can conclude that for a given sick flag value:
+ *
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked => has not been examined since mount
+ */
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+struct xfs_fsop_geom;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
+#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
+#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
+#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB (1 << 0) /* superblock */
+#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */
+#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */
+#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */
+#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */
+#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */
+#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */
+#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
+#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
+#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */
+#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */
+#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */
+#define XFS_SICK_INO_DIR (1 << 4) /* directory */
+#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */
+#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
+ XFS_SICK_FS_UQUOTA | \
+ XFS_SICK_FS_GQUOTA | \
+ XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
+ XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
+ XFS_SICK_AG_AGF | \
+ XFS_SICK_AG_AGFL | \
+ XFS_SICK_AG_AGI | \
+ XFS_SICK_AG_BNOBT | \
+ XFS_SICK_AG_CNTBT | \
+ XFS_SICK_AG_INOBT | \
+ XFS_SICK_AG_FINOBT | \
+ XFS_SICK_AG_RMAPBT | \
+ XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \
+ XFS_SICK_INO_BMBTD | \
+ XFS_SICK_INO_BMBTA | \
+ XFS_SICK_INO_BMBTC | \
+ XFS_SICK_INO_DIR | \
+ XFS_SICK_INO_XATTR | \
+ XFS_SICK_INO_SYMLINK | \
+ XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_health_unmount(struct xfs_mount *mp);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+ return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+ return !xfs_inode_has_sickness(ip, -1U);
+}
+
+void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
+void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+
+#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000..94db50eb7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2969 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ xfs_lookup_t dir, /* <=, >=, == */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_holemask = 0;
+ cur->bc_rec.i.ir_count = 0;
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
+ rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+ rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+ rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ }
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/* Convert on-disk btree record to incore inobt record. */
+void
+xfs_inobt_btrec_to_irec(
+ struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec)
+{
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ if (xfs_has_sparseinodes(mp)) {
+ irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+ irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+ irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+ } else {
+ /*
+ * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+ * values for full inode chunks.
+ */
+ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+ irec->ir_count = XFS_INODES_PER_CHUNK;
+ irec->ir_freecount =
+ be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
+ }
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ union xfs_btree_rec *rec;
+ int error;
+ uint64_t realfree;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, irec);
+
+ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
+ goto out_bad_rec;
+ if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
+ irec->ir_count > XFS_INODES_PER_CHUNK)
+ goto out_bad_rec;
+ if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
+ goto out_bad_rec;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(irec->ir_holemask))
+ realfree = irec->ir_free;
+ else
+ realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
+ if (hweight64(realfree) != irec->ir_freecount)
+ goto out_bad_rec;
+
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "%s Inode BTree record corruption in AG %d detected!",
+ cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+ cur->bc_ag.pag->pag_agno);
+ xfs_warn(mp,
+"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
+ irec->ir_startino, irec->ir_count, irec->ir_freecount,
+ irec->ir_free, irec->ir_holemask);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ uint16_t holemask,
+ uint8_t count,
+ int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+ XFS_INODES_PER_CHUNK,
+ XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+static int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!xfs_is_shutdown(cur->bc_mp))
+ ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur) 0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ int icount,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int nbufs;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+ xfs_ino_t ino = 0;
+ int error;
+
+ /*
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * blocks size.
+ */
+ nbufs = length / M_IGEO(mp)->blocks_per_cluster;
+
+ /*
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_AGB_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have ot log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inode we log the entire buffer rather than just the
+ * inode cores.
+ */
+ if (xfs_has_v3inodes(mp)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
+
+ /*
+ * log the initialisation that is about to take place as an
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, icount,
+ mp->m_sb.sb_inodesize, length, gen);
+ } else
+ version = 2;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno +
+ (j * M_IGEO(mp)->blocks_per_cluster));
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
+ XBF_UNMAPPED, &fbuf);
+ if (error)
+ return error;
+
+ /* Initialize the inode buffers and log them appropriately. */
+ fbuf->b_ops = &xfs_inode_buf_ops;
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid,
+ &mp->m_sb.sb_meta_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + XFS_DINODE_SIZE(mp) - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures the they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+ struct xfs_mount *mp,
+ xfs_agino_t *startino,
+ uint16_t *allocmask)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = XFS_AGB_TO_AGINO(mp, mod);
+ *startino -= offset;
+
+ /*
+ * Since startino has been aligned down, left shift allocmask such that
+ * it continues to represent the same physical inodes relative to the
+ * new startino.
+ */
+ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ uint64_t talloc;
+ uint64_t salloc;
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* both records must track some inodes */
+ if (!trec->ir_count || !srec->ir_count)
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ talloc = xfs_inobt_irec_to_allocmask(trec);
+ salloc = xfs_inobt_irec_to_allocmask(srec);
+ if (talloc & salloc)
+ return false;
+
+ return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /*
+ * Merge the holemask and free mask. For both fields, 0 bits refer to
+ * allocated inodes. We combine the allocated ranges with bitwise AND.
+ */
+ trec->ir_holemask &= srec->ir_holemask;
+ trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ int btnum,
+ struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
+ bool merge) /* merge or replace */
+{
+ struct xfs_btree_cur *cur;
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ goto out;
+ }
+
+ /*
+ * A record exists at this startino. Merge or replace the record
+ * depending on what we've been asked to do.
+ */
+ if (merge) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
+ }
+
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
+ * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
+ * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
+ * inode count threshold, or the usual negative error code for other errors.
+ */
+STATIC int
+xfs_ialloc_ag_alloc(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agi *agi;
+ struct xfs_alloc_arg args;
+ int error;
+ xfs_agino_t newino; /* new first inode's number */
+ xfs_agino_t newlen; /* new number of inodes */
+ int isaligned = 0; /* inode allocation at stripe */
+ /* unit boundary */
+ /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ uint16_t allocmask = (uint16_t) -1;
+ int do_sparse = 0;
+
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+ args.fsbno = NULLFSBLOCK;
+ args.oinfo = XFS_RMAP_OINFO_INODES;
+
+#ifdef DEBUG
+ /* randomly do sparse inode allocations */
+ if (xfs_has_sparseinodes(tp->t_mountp) &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks)
+ do_sparse = prandom_u32_max(2);
+#endif
+
+ /*
+ * Locking will ensure that we don't have two callers in here
+ * at one time.
+ */
+ newlen = igeo->ialloc_inos;
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&args.mp->m_icount) + newlen >
+ igeo->maxicount)
+ return -ENOSPC;
+ args.minlen = args.maxlen = igeo->ialloc_blks;
+ /*
+ * First try to allocate inodes contiguous with the last-allocated
+ * chunk of inodes. If the filesystem is striped, this will fill
+ * an entire stripe unit with inodes.
+ */
+ agi = agbp->b_addr;
+ newino = be32_to_cpu(agi->agi_newino);
+ args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+ igeo->ialloc_blks;
+ if (do_sparse)
+ goto sparse_alloc;
+ if (likely(newino != NULLAGINO &&
+ (args.agbno < be32_to_cpu(agi->agi_length)))) {
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.prod = 1;
+
+ /*
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every oher
+ * attempt allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
+ */
+ args.alignment = 1;
+ args.minalignslop = igeo->cluster_align - 1;
+
+ /* Allow space for the inode btree to split. */
+ args.minleft = igeo->inobt_maxlevels;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
+ args.minalignslop = 0;
+ }
+
+ if (unlikely(args.fsbno == NULLFSBLOCK)) {
+ /*
+ * Set the alignment for the allocation.
+ * If stripe alignment is turned on then align at stripe unit
+ * boundary.
+ * If the cluster size is smaller than a filesystem block
+ * then we're doing I/O for inodes in filesystem block size
+ * pieces, so don't need alignment anyway.
+ */
+ isaligned = 0;
+ if (igeo->ialloc_align) {
+ ASSERT(!xfs_has_noalign(args.mp));
+ args.alignment = args.mp->m_dalign;
+ isaligned = 1;
+ } else
+ args.alignment = igeo->cluster_align;
+ /*
+ * Need to figure out where to allocate the inode blocks.
+ * Ideally they should be spaced out through the a.g.
+ * For now, just allocate blocks up front.
+ */
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ /*
+ * Allocate a fixed-size extent of inodes.
+ */
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.prod = 1;
+ /*
+ * Allow space for the inode btree to split.
+ */
+ args.minleft = igeo->inobt_maxlevels;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ /*
+ * If stripe alignment is turned on, then try again with cluster
+ * alignment.
+ */
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.alignment = igeo->cluster_align;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_has_sparseinodes(args.mp) &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spino_align;
+ args.prod = 1;
+
+ args.minlen = igeo->ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocation from AG boundaries that result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ igeo->ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
+ ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
+
+ if (args.fsbno == NULLFSBLOCK)
+ return -EAGAIN;
+
+ ASSERT(args.len == args.minlen);
+
+ /*
+ * Stamp and write the inode buffers.
+ *
+ * Seed the new inode cluster with a random generation number. This
+ * prevents short-term reuse of generation numbers if a chunk is
+ * freed and then immediately reallocated. We use random numbers
+ * rather than a linear progression to prevent the next generation
+ * number from being easily guessable.
+ */
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
+ args.agbno, args.len, get_random_u32());
+
+ if (error)
+ return error;
+ /*
+ * Convert the results.
+ */
+ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk. Align the startino and mask.
+ */
+ xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /*
+ * Insert the sparse record into the inobt and allow for a merge
+ * if necessary. If a merge does occur, rec is updated to the
+ * merged record.
+ */
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_INO, &rec, true);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ /*
+ * We can't merge the part we've just allocated as for the inobt
+ * due to finobt semantics. The original record may or may not
+ * exist independent of whether physical inodes exist in this
+ * sparse chunk.
+ *
+ * We must update the finobt record based on the inobt record.
+ * rec contains the fully merged and up to date inobt record
+ * from the previous call. Set merge false to replace any
+ * existing record with this one.
+ */
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_FINO, &rec, false);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk - insert new records to both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
+ be32_add_cpu(&agi->agi_count, newlen);
+ be32_add_cpu(&agi->agi_freecount, newlen);
+ pag->pagi_freecount += newlen;
+ pag->pagi_count += newlen;
+ agi->agi_newino = cpu_to_be32(newino);
+
+ /*
+ * Log allocation group header fields
+ */
+ xfs_ialloc_log_agi(tp, agbp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+ /*
+ * Modify/log superblock values for inode count and inode free count.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+ return 0;
+}
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_inofree_t realfree;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(rec->ir_holemask))
+ return xfs_lowbit64(rec->ir_free);
+
+ realfree = xfs_inobt_irec_to_allocmask(rec);
+ realfree &= rec->ir_free;
+
+ return xfs_lowbit64(realfree);
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agbp->b_addr;
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_btree_cur *cur, *tcur;
+ struct xfs_inobt_rec_incore rec, trec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i, j;
+ int searchdistance = 10;
+
+ ASSERT(pag->pagi_init);
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ /*
+ * If in the same AG as the parent, try to get near the parent.
+ */
+ if (pagno == pag->pag_agno) {
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ if (rec.ir_freecount > 0) {
+ /*
+ * Found a free inode in the same chunk
+ * as the parent, done.
+ */
+ goto alloc_inode;
+ }
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * Skip to last blocks looked up if same parent inode.
+ */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft);
+ if (error)
+ goto error1;
+
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright);
+ if (error)
+ goto error1;
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (--searchdistance > 0 && (!doneleft || !doneright)) {
+ int useleft; /* using left inode chunk this time */
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
+ }
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ rec = trec;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
+ }
+
+ if (searchdistance <= 0) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+
+ } else {
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
+ }
+
+ /*
+ * In a different AG from the parent.
+ * See if the most recently allocated block has any free.
+ */
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error0;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
+ }
+ }
+ }
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ }
+
+alloc_inode:
+ offset = xfs_inobt_first_free_inode(&rec);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error0;
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+ *inop = ino;
+ return 0;
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
+
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
+ }
+
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
+ }
+
+ /*
+ * Find the first inode available in the AG.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rec.ir_free != frec->ir_free ||
+ rec.ir_freecount != frec->ir_freecount))
+ return -EFSCORRUPTED;
+
+ return xfs_inobt_update(cur, &rec);
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+static int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agbp->b_addr;
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_has_finobt(mp))
+ return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (pag->pag_agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_inobt_first_free_inode(&rec);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+static int
+xfs_dialloc_roll(
+ struct xfs_trans **tpp,
+ struct xfs_buf *agibp)
+{
+ struct xfs_trans *tp = *tpp;
+ struct xfs_dquot_acct *dqinfo;
+ int error;
+
+ /*
+ * Hold to on to the agibp across the commit so no other allocation can
+ * come in and take the free inodes we just allocated for our caller.
+ */
+ xfs_trans_bhold(tp, agibp);
+
+ /*
+ * We want the quota changes to be associated with the next transaction,
+ * NOT this one. So, detach the dqinfo from this and attach it to the
+ * next transaction.
+ */
+ dqinfo = tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+
+ error = xfs_trans_roll(&tp);
+
+ /* Re-attach the quota info that we detached from prev trx. */
+ tp->t_dqinfo = dqinfo;
+
+ /*
+ * Join the buffer even on commit error so that the buffer is released
+ * when the caller cancels the transaction and doesn't have to handle
+ * this error case specially.
+ */
+ xfs_trans_bjoin(tp, agibp);
+ *tpp = tp;
+ return error;
+}
+
+static xfs_agnumber_t
+xfs_ialloc_next_ag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+
+ spin_lock(&mp->m_agirotor_lock);
+ agno = mp->m_agirotor;
+ if (++mp->m_agirotor >= mp->m_maxagi)
+ mp->m_agirotor = 0;
+ spin_unlock(&mp->m_agirotor_lock);
+
+ return agno;
+}
+
+static bool
+xfs_dialloc_good_ag(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ umode_t mode,
+ int flags,
+ bool ok_alloc)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_extlen_t ineed;
+ xfs_extlen_t longest = 0;
+ int needspace;
+ int error;
+
+ if (!pag->pagi_inodeok)
+ return false;
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_read_agi(pag, tp, NULL);
+ if (error)
+ return false;
+ }
+
+ if (pag->pagi_freecount)
+ return true;
+ if (!ok_alloc)
+ return false;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, flags, NULL);
+ if (error)
+ return false;
+ }
+
+ /*
+ * Check that there is enough free space for the file plus a chunk of
+ * inodes if we need to allocate some. If this is the first pass across
+ * the AGs, take into account the potential space needed for alignment
+ * of inode chunks when checking the longest contiguous free space in
+ * the AG - this prevents us from getting ENOSPC because we have free
+ * space larger than ialloc_blks but alignment constraints prevent us
+ * from using it.
+ *
+ * If we can't find an AG with space for full alignment slack to be
+ * taken into account, we must be near ENOSPC in all AGs. Hence we
+ * don't include alignment for the second pass and so if we fail
+ * allocation due to alignment issues then it is most likely a real
+ * ENOSPC condition.
+ *
+ * XXX(dgc): this calculation is now bogus thanks to the per-ag
+ * reservations that xfs_alloc_fix_freelist() now does via
+ * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
+ * be more than large enough for the check below to succeed, but
+ * xfs_alloc_space_available() will fail because of the non-zero
+ * metadata reservation and hence we won't actually be able to allocate
+ * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
+ * because of this.
+ */
+ ineed = M_IGEO(mp)->ialloc_min_blks;
+ if (flags && ineed > 1)
+ ineed += M_IGEO(mp)->cluster_align;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+ needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+
+ if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
+ return false;
+ return true;
+}
+
+static int
+xfs_dialloc_try_ag(
+ struct xfs_trans **tpp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *new_ino,
+ bool ok_alloc)
+{
+ struct xfs_buf *agbp;
+ xfs_ino_t ino;
+ int error;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
+ if (error)
+ return error;
+
+ if (!pag->pagi_freecount) {
+ if (!ok_alloc) {
+ error = -EAGAIN;
+ goto out_release;
+ }
+
+ error = xfs_ialloc_ag_alloc(*tpp, agbp, pag);
+ if (error < 0)
+ goto out_release;
+
+ /*
+ * We successfully allocated space for an inode cluster in this
+ * AG. Roll the transaction so that we can allocate one of the
+ * new inodes.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ error = xfs_dialloc_roll(tpp, agbp);
+ if (error)
+ goto out_release;
+ }
+
+ /* Allocate an inode in the found AG */
+ error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino);
+ if (!error)
+ *new_ino = ino;
+ return error;
+
+out_release:
+ xfs_trans_brelse(*tpp, agbp);
+ return error;
+}
+
+/*
+ * Allocate an on-disk inode.
+ *
+ * Mode is used to tell whether the new inode is a directory and hence where to
+ * locate it. The on-disk inode that is allocated will be returned in @new_ino
+ * on success, otherwise an error will be set to indicate the failure (e.g.
+ * -ENOSPC).
+ */
+int
+xfs_dialloc(
+ struct xfs_trans **tpp,
+ xfs_ino_t parent,
+ umode_t mode,
+ xfs_ino_t *new_ino)
+{
+ struct xfs_mount *mp = (*tpp)->t_mountp;
+ xfs_agnumber_t agno;
+ int error = 0;
+ xfs_agnumber_t start_agno;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ bool ok_alloc = true;
+ int flags;
+ xfs_ino_t ino;
+
+ /*
+ * Directories, symlinks, and regular files frequently allocate at least
+ * one block, so factor that potential expansion when we examine whether
+ * an AG has enough space for file creation.
+ */
+ if (S_ISDIR(mode))
+ start_agno = xfs_ialloc_next_ag(mp);
+ else {
+ start_agno = XFS_INO_TO_AGNO(mp, parent);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
+ }
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * ok_alloc so we scan all available agi structures for a free
+ * inode.
+ *
+ * Read rough value of mp->m_icount by percpu_counter_read_positive,
+ * which will sacrifice the preciseness but improve the performance.
+ */
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
+ > igeo->maxicount) {
+ ok_alloc = false;
+ }
+
+ /*
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
+ */
+ agno = start_agno;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ for (;;) {
+ pag = xfs_perag_get(mp, agno);
+ if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) {
+ error = xfs_dialloc_try_ag(tpp, pag, parent,
+ &ino, ok_alloc);
+ if (error != -EAGAIN)
+ break;
+ }
+
+ if (xfs_is_shutdown(mp)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ if (++agno == mp->m_maxagi)
+ agno = 0;
+ if (agno == start_agno) {
+ if (!flags) {
+ error = -ENOSPC;
+ break;
+ }
+ flags = 0;
+ }
+ xfs_perag_put(pag);
+ }
+
+ if (!error)
+ *new_ino = ino;
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp,
+ rec->ir_startino);
+ int startidx, endidx;
+ int nextbit;
+ xfs_agblock_t agbno;
+ int contigblk;
+ DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+ if (!xfs_inobt_issparse(rec->ir_holemask)) {
+ /* not sparse, calculate extent info directly */
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ M_IGEO(mp)->ialloc_blks,
+ &XFS_RMAP_OINFO_INODES);
+ return;
+ }
+
+ /* holemask is only 16-bits (fits in an unsigned long) */
+ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+ holemask[0] = rec->ir_holemask;
+
+ /*
+ * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+ * holemask and convert the start/end index of each range to an extent.
+ * We start with the start and end index both pointing at the first 0 in
+ * the mask.
+ */
+ startidx = endidx = find_first_zero_bit(holemask,
+ XFS_INOBT_HOLEMASK_BITS);
+ nextbit = startidx + 1;
+ while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+ nextbit);
+ /*
+ * If the next zero bit is contiguous, update the end index of
+ * the current range and continue.
+ */
+ if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+ nextbit == endidx + 1) {
+ endidx = nextbit;
+ goto next;
+ }
+
+ /*
+ * nextbit is not contiguous with the current end index. Convert
+ * the current start/end to an extent and add it to the free
+ * list.
+ */
+ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+ contigblk = ((endidx - startidx + 1) *
+ XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+
+ ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+ ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk, &XFS_RMAP_OINFO_INODES);
+
+ /* reset range to current bit and carry on... */
+ startidx = endidx = nextbit;
+
+next:
+ nextbit++;
+ }
+}
+
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ struct xfs_icluster *xic,
+ struct xfs_inobt_rec_incore *orec)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+ /*
+ * Initialize the cursor.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ /*
+ * Look for the entry describing this inode.
+ */
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+ xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
+ /*
+ * Get the offset in the inode chunk.
+ */
+ off = agino - rec.ir_startino;
+ ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+ /*
+ * Mark the inode free & increment the count.
+ */
+ rec.ir_free |= XFS_INOBT_MASK(off);
+ rec.ir_freecount++;
+
+ /*
+ * When an inode chunk is free, it becomes eligible for removal. Don't
+ * remove the chunk if the block size is large enough for multiple inode
+ * chunks (that might not be free).
+ */
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ struct xfs_perag *pag = agbp->b_pag;
+
+ xic->deleted = true;
+ xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ rec.ir_startino);
+ xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
+
+ /*
+ * Remove the inode cluster from the AGI B+Tree, adjust the
+ * AGI and Superblock inode counts, and mark the disk space
+ * to be freed when the transaction is committed.
+ */
+ ilen = rec.ir_freecount;
+ be32_add_cpu(&agi->agi_count, -ilen);
+ be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+ pag->pagi_freecount -= ilen - 1;
+ pag->pagi_count -= ilen;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+ if ((error = xfs_btree_delete(cur, &i))) {
+ xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ } else {
+ xic->deleted = false;
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ /*
+ * Change the inode free counts and log the ag/sb changes.
+ */
+ be32_add_cpu(&agi->agi_freecount, 1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount++;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+ }
+
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error0;
+
+ *orec = rec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino;
+ int error;
+ int i;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ if (XFS_IS_CORRUPT(mp,
+ rec.ir_free != ibtrec->ir_free ||
+ rec.ir_freecount != ibtrec->ir_freecount)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ *
+ * Note that we currently can't free chunks when the block size is large
+ * enough for multiple chunks. Leave the finobt record to remain in sync
+ * with the inobt.
+ */
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_ino_t inode,
+ struct xfs_icluster *xic)
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ int error; /* error return value */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ /*
+ * Break up inode number into its components.
+ */
+ if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
+ ASSERT(0);
+ return -EINVAL;
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ ASSERT(0);
+ return -EINVAL;
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return -EINVAL;
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_has_finobt(mp)) {
+ error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_agblock_t *chunk_agbno,
+ xfs_agblock_t *offset_agbno,
+ int flags)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ int error;
+ int i;
+
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error) {
+ xfs_alert(mp,
+ "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+ __func__, error, pag->pag_agno);
+ return error;
+ }
+
+ /*
+ * Lookup the inode record for the given agino. If the record cannot be
+ * found, then it's an invalid inode number and we should abort. Once
+ * we have a record, we need to ensure it contains the inode number
+ * we are looking up.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+ if (!error) {
+ if (i)
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (!error && i == 0)
+ error = -EINVAL;
+ }
+
+ xfs_trans_brelse(tp, agbp);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ /* check that the returned record contains the required inode */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
+ return -EINVAL;
+
+ /* for untrusted inodes check it is allocated first */
+ if ((flags & XFS_IGET_UNTRUSTED) &&
+ (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+ return -EINVAL;
+
+ *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+ *offset_agbno = agbno - *chunk_agbno;
+ return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
+{
+ xfs_agblock_t agbno; /* block number of inode in the alloc group */
+ xfs_agino_t agino; /* inode number within alloc group */
+ xfs_agblock_t chunk_agbno; /* first block in inode chunk */
+ xfs_agblock_t cluster_agbno; /* first block in inode cluster */
+ int error; /* error code */
+ int offset; /* index of inode in its buffer */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ struct xfs_perag *pag;
+
+ ASSERT(ino != NULLFSINO);
+
+ /*
+ * Split up the inode number into its parts.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (!pag || agbno >= mp->m_sb.sb_agblocks ||
+ ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ error = -EINVAL;
+#ifdef DEBUG
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
+ goto out_drop;
+ if (!pag) {
+ xfs_alert(mp,
+ "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+ __func__, XFS_INO_TO_AGNO(mp, ino),
+ mp->m_sb.sb_agcount);
+ }
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_alert(mp,
+ "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+ __func__, (unsigned long long)agbno,
+ (unsigned long)mp->m_sb.sb_agblocks);
+ }
+ if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ xfs_alert(mp,
+ "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ __func__, ino,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ }
+ xfs_stack_trace();
+#endif /* DEBUG */
+ goto out_drop;
+ }
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ goto out_drop;
+ goto out_map;
+ }
+
+ /*
+ * If the inode cluster size is the same as the blocksize or
+ * smaller we get to the buffer by simple arithmetics.
+ */
+ if (M_IGEO(mp)->blocks_per_cluster == 1) {
+ offset = XFS_INO_TO_OFFSET(mp, ino);
+ ASSERT(offset < mp->m_sb.sb_inopblock);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (unsigned short)(offset <<
+ mp->m_sb.sb_inodelog);
+ error = 0;
+ goto out_drop;
+ }
+
+ /*
+ * If the inode chunks are aligned then use simple maths to
+ * find the location. Otherwise we have to do a btree
+ * lookup to find the location.
+ */
+ if (M_IGEO(mp)->inoalign_mask) {
+ offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
+ chunk_agbno = agbno - offset_agbno;
+ } else {
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ goto out_drop;
+ }
+
+out_map:
+ ASSERT(agbno >= chunk_agbno);
+ cluster_agbno = chunk_agbno +
+ ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
+ M_IGEO(mp)->blocks_per_cluster);
+ offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+ XFS_INO_TO_OFFSET(mp, ino);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+ imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return NULL rather than calling
+ * read_buf and panicing when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_alert(mp,
+ "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+ __func__, (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ error = -EINVAL;
+ goto out_drop;
+ }
+ error = 0;
+out_drop:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */
+void
+xfs_ialloc_log_agi(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
+{
+ int first; /* first byte number */
+ int last; /* last byte number */
+ static const short offsets[] = { /* field starting offsets */
+ /* keep in sync with bit definitions */
+ offsetof(xfs_agi_t, agi_magicnum),
+ offsetof(xfs_agi_t, agi_versionnum),
+ offsetof(xfs_agi_t, agi_seqno),
+ offsetof(xfs_agi_t, agi_length),
+ offsetof(xfs_agi_t, agi_count),
+ offsetof(xfs_agi_t, agi_root),
+ offsetof(xfs_agi_t, agi_level),
+ offsetof(xfs_agi_t, agi_freecount),
+ offsetof(xfs_agi_t, agi_newino),
+ offsetof(xfs_agi_t, agi_dirino),
+ offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
+ sizeof(xfs_agi_t)
+ };
+#ifdef DEBUG
+ struct xfs_agi *agi = bp->b_addr;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+ /*
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
+ */
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
+ /*
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
+ */
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+}
+
+static xfs_failaddr_t
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_agi *agi = bp->b_addr;
+ int i;
+
+ if (xfs_has_crc(mp)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
+ return __this_address;
+ }
+
+ /*
+ * Validate the magic number of the agi block.
+ */
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
+ return __this_address;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return __this_address;
+
+ if (be32_to_cpu(agi->agi_level) < 1 ||
+ be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
+ return __this_address;
+
+ if (xfs_has_finobt(mp) &&
+ (be32_to_cpu(agi->agi_free_level) < 1 ||
+ be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
+ return __this_address;
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return __this_address;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
+ continue;
+ if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (xfs_has_crc(mp) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agi_verify(bp);
+ if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_agi_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agi *agi = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ fa = xfs_agi_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (bip)
+ agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+ .verify_struct = xfs_agi_verify,
+};
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ if (error)
+ return error;
+ if (tp)
+ xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
+
+ xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
+ return 0;
+}
+
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
+int
+xfs_ialloc_read_agi(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
+{
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
+ int error;
+
+ trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
+
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ agi = agibp->b_addr;
+ if (!pag->pagi_init) {
+ pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+ pag->pagi_count = be32_to_cpu(agi->agi_count);
+ pag->pagi_init = 1;
+ }
+
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ xfs_is_shutdown(pag->pag_mount));
+ if (agibpp)
+ *agibpp = agibp;
+ else
+ xfs_trans_brelse(tp, agibp);
+ return 0;
+}
+
+/* Is there an inode record covering a given range of inode numbers? */
+int
+xfs_ialloc_has_inode_record(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t low,
+ xfs_agino_t high,
+ bool *exists)
+{
+ struct xfs_inobt_rec_incore irec;
+ xfs_agino_t agino;
+ uint16_t holemask;
+ int has_record;
+ int i;
+ int error;
+
+ *exists = false;
+ error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
+ while (error == 0 && has_record) {
+ error = xfs_inobt_get_rec(cur, &irec, &has_record);
+ if (error || irec.ir_startino > high)
+ break;
+
+ agino = irec.ir_startino;
+ holemask = irec.ir_holemask;
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
+ i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
+ if (holemask & 1)
+ continue;
+ if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
+ agino <= high) {
+ *exists = true;
+ return 0;
+ }
+ }
+
+ error = xfs_btree_increment(cur, 0, &has_record);
+ }
+ return error;
+}
+
+/* Is there an inode record covering a given extent? */
+int
+xfs_ialloc_has_inodes_at_extent(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ xfs_agino_t low;
+ xfs_agino_t high;
+
+ low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+ high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
+
+ return xfs_ialloc_has_inode_record(cur, low, high, exists);
+}
+
+struct xfs_ialloc_count_inodes {
+ xfs_agino_t count;
+ xfs_agino_t freecount;
+};
+
+/* Record inode counts across all inobt records. */
+STATIC int
+xfs_ialloc_count_inodes_rec(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_ialloc_count_inodes *ci = priv;
+
+ xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
+ ci->count += irec.ir_count;
+ ci->freecount += irec.ir_freecount;
+
+ return 0;
+}
+
+/* Count allocated and free inodes under an inobt. */
+int
+xfs_ialloc_count_inodes(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t *count,
+ xfs_agino_t *freecount)
+{
+ struct xfs_ialloc_count_inodes ci = {0};
+ int error;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
+ if (error)
+ return error;
+
+ *count = ci.count;
+ *freecount = ci.freecount;
+ return 0;
+}
+
+/*
+ * Initialize inode-related geometry information.
+ *
+ * Compute the inode btree min and max levels and set maxicount.
+ *
+ * Set the inode cluster size. This may still be overridden by the file
+ * system block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to keep a
+ * constant ratio of inode per cluster buffer, but only if mkfs has set the
+ * inode alignment value appropriately for larger cluster sizes.
+ *
+ * Then compute the inode cluster alignment information.
+ */
+void
+xfs_ialloc_setup_geometry(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ uint64_t icount;
+ uint inodes;
+
+ igeo->new_diflags2 = 0;
+ if (xfs_has_bigtime(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
+
+ /* Compute inode btree geometry. */
+ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
+ igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
+
+ igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ igeo->ialloc_min_blks = sbp->sb_spino_align;
+ else
+ igeo->ialloc_min_blks = igeo->ialloc_blks;
+
+ /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
+ inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
+
+ /*
+ * Set the maximum inode count for this filesystem, being careful not
+ * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
+ * users should never get here due to failing sb verification, but
+ * certain users (xfs_db) need to be usable even with corrupt metadata.
+ */
+ if (sbp->sb_imax_pct && igeo->ialloc_blks) {
+ /*
+ * Make sure the maximum inode count is a multiple
+ * of the units we allocate inodes in.
+ */
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ do_div(icount, igeo->ialloc_blks);
+ igeo->maxicount = XFS_FSB_TO_INO(mp,
+ icount * igeo->ialloc_blks);
+ } else {
+ igeo->maxicount = 0;
+ }
+
+ /*
+ * Compute the desired size of an inode cluster buffer size, which
+ * starts at 8K and (on v5 filesystems) scales up with larger inode
+ * sizes.
+ *
+ * Preserve the desired inode cluster size because the sparse inodes
+ * feature uses that desired size (not the actual size) to compute the
+ * sparse inode alignment. The mount code validates this value, so we
+ * cannot change the behavior.
+ */
+ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_has_v3inodes(mp)) {
+ int new_size = igeo->inode_cluster_size_raw;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ igeo->inode_cluster_size_raw = new_size;
+ }
+
+ /* Calculate inode cluster ratios. */
+ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
+ igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
+ igeo->inode_cluster_size_raw);
+ else
+ igeo->blocks_per_cluster = 1;
+ igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
+ igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
+
+ /* Calculate inode cluster alignment. */
+ if (xfs_has_align(mp) &&
+ mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
+ igeo->cluster_align = mp->m_sb.sb_inoalignmt;
+ else
+ igeo->cluster_align = 1;
+ igeo->inoalign_mask = igeo->cluster_align - 1;
+ igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
+
+ /*
+ * If we are using stripe alignment, check whether
+ * the stripe unit is a multiple of the inode alignment
+ */
+ if (mp->m_dalign && igeo->inoalign_mask &&
+ !(mp->m_dalign & igeo->inoalign_mask))
+ igeo->ialloc_align = mp->m_dalign;
+ else
+ igeo->ialloc_align = 0;
+}
+
+/* Compute the location of the root directory inode that is laid out by mkfs. */
+xfs_ino_t
+xfs_ialloc_calc_rootino(
+ struct xfs_mount *mp,
+ int sunit)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t first_bno;
+
+ /*
+ * Pre-calculate the geometry of AG 0. We know what it looks like
+ * because libxfs knows how to create allocation groups now.
+ *
+ * first_bno is the first block in which mkfs could possibly have
+ * allocated the root directory inode, once we factor in the metadata
+ * that mkfs formats before it. Namely, the four AG headers...
+ */
+ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+ /* ...the two free space btree roots... */
+ first_bno += 2;
+
+ /* ...the inode btree root... */
+ first_bno += 1;
+
+ /* ...the initial AGFL... */
+ first_bno += xfs_alloc_min_freelist(mp, NULL);
+
+ /* ...the free inode btree root... */
+ if (xfs_has_finobt(mp))
+ first_bno++;
+
+ /* ...the reverse mapping btree root... */
+ if (xfs_has_rmapbt(mp))
+ first_bno++;
+
+ /* ...the reference count btree... */
+ if (xfs_has_reflink(mp))
+ first_bno++;
+
+ /*
+ * ...and the log, if it is allocated in the first allocation group.
+ *
+ * This can happen with filesystems that only have a single
+ * allocation group, or very odd geometries created by old mkfs
+ * versions on very small filesystems.
+ */
+ if (xfs_ag_contains_log(mp, 0))
+ first_bno += mp->m_sb.sb_logblocks;
+
+ /*
+ * Now round first_bno up to whatever allocation alignment is given
+ * by the filesystem or was passed in.
+ */
+ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
+ first_bno = roundup(first_bno, sunit);
+ else if (xfs_has_align(mp) &&
+ mp->m_sb.sb_inoalignmt > 1)
+ first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
+
+ return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
+}
+
+/*
+ * Ensure there are not sparse inode clusters that cross the new EOAG.
+ *
+ * This is a no-op for non-spinode filesystems since clusters are always fully
+ * allocated and checking the bnobt suffices. However, a spinode filesystem
+ * could have a record where the upper inodes are free blocks. If those blocks
+ * were removed from the filesystem, the inode record would extend beyond EOAG,
+ * which will be flagged as corruption.
+ */
+int
+xfs_ialloc_check_shrink(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ xfs_agblock_t new_length)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length);
+ int has;
+ int error;
+
+ if (!xfs_has_sparseinodes(mp))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO);
+
+ /* Look up the inobt record that would correspond to the new EOFS. */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
+ if (error || !has)
+ goto out;
+
+ error = xfs_inobt_get_rec(cur, &rec, &has);
+ if (error)
+ goto out;
+
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* If the record covers inodes that would be beyond EOFS, bail out. */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
+ error = -ENOSPC;
+ goto out;
+ }
+out:
+ xfs_btree_del_cursor(cur, error);
+ xfs_perag_put(pag);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644
index 000000000..9bbbca6ac
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_btree_cur;
+
+/* Move inodes in clusters of this size */
+#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+
+struct xfs_icluster {
+ bool deleted; /* record is deleted */
+ xfs_ino_t first_ino; /* first inode number */
+ uint64_t alloc; /* inode phys. allocation bitmap for
+ * sparse chunks */
+};
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
+{
+ return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
+}
+
+/*
+ * Allocate an inode on disk. Mode is used to tell whether the new inode will
+ * need space, and whether it is a directory.
+ */
+int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+ xfs_ino_t *new_ino);
+
+int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
+ xfs_ino_t ino, struct xfs_icluster *ifree);
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags); /* flags for inode btree lookup */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* allocation group header buffer */
+ uint32_t fields); /* bitmask of fields to log */
+
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
+
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list, int icount,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
+
+
+union xfs_btree_rec;
+void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
+ struct xfs_inobt_rec_incore *irec);
+int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
+ xfs_agino_t high, bool *exists);
+int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
+ xfs_agino_t *freecount);
+int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
+ uint8_t count, int32_t freecount, xfs_inofree_t free,
+ int *stat);
+
+int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
+
+int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf *agibp, xfs_agblock_t new_length);
+
+#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000..8c83e2657
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,833 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_inobt_cur_cache;
+
+STATIC int
+xfs_inobt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+}
+
+STATIC void
+xfs_inobt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ agi->agi_root = nptr->s;
+ be32_add_cpu(&agi->agi_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ agi->agi_free_root = nptr->s;
+ be32_add_cpu(&agi->agi_free_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_has_inobtcounts(cur->bc_mp))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
+STATIC int
+__xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat,
+ enum xfs_ag_resv_type resv)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+ xfs_agblock_t sbno = be32_to_cpu(start->s);
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.oinfo = XFS_RMAP_OINFO_INOBT;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.resv = resv;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+ *stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
+ return 0;
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_alloc_block(cur, start, new, stat);
+ return __xfs_inobt_alloc_block(cur, start, new, stat,
+ XFS_AG_RESV_METADATA);
+}
+
+STATIC int
+__xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ enum xfs_ag_resv_type resv)
+{
+ xfs_inobt_mod_blockcount(cur, -1);
+ return xfs_free_extent(cur->bc_tp,
+ XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
+ &XFS_RMAP_OINFO_INOBT, resv);
+}
+
+STATIC int
+xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_free_block(cur, bp);
+ return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->inobt.ir_startino);
+ x += XFS_INODES_PER_CHUNK - 1;
+ key->inobt.ir_startino = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
+ rec->inobt.ir_u.sp.ir_holemask =
+ cpu_to_be16(cur->bc_rec.i.ir_holemask);
+ rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+ rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec->inobt.ir_u.f.ir_freecount =
+ cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ }
+ rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+
+ ptr->s = agi->agi_root;
+}
+
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root;
+}
+
+STATIC int64_t
+xfs_inobt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ cur->bc_rec.i.ir_startino;
+}
+
+STATIC int64_t
+xfs_inobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
+static xfs_failaddr_t
+xfs_inobt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ /*
+ * During growfs operations, we can't verify the exact owner as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agi information will not yet have been initialised
+ * from the on disk AGI. We don't currently use any of this information,
+ * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+ * ever do.
+ */
+ if (xfs_has_crc(mp)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
+
+ /* level verification */
+ level = be16_to_cpu(block->bb_level);
+ if (level >= M_IGEO(mp)->inobt_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp,
+ M_IGEO(mp)->inobt_mxr[level != 0]);
+}
+
+static void
+xfs_inobt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_inobt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_inobt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_inobt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+STATIC int
+xfs_inobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->inobt.ir_startino) <
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+ be32_to_cpu(r2->inobt.ir_startino);
+}
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_inobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .diff_two_keys = xfs_inobt_diff_two_keys,
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root,
+ .alloc_block = xfs_finobt_alloc_block,
+ .free_block = xfs_finobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .diff_two_keys = xfs_inobt_diff_two_keys,
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+};
+
+/*
+ * Initialize a new inode btree cursor.
+ */
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+ cur->bc_ops = &xfs_finobt_ops;
+ }
+
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+ return cur;
+}
+
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur = xfs_inobt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_INO)
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ else
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_inobt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new inobt btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+ agi->agi_root = cpu_to_be32(afake->af_root);
+ agi->agi_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ } else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+ agi->agi_free_root = cpu_to_be32(afake->af_root);
+ agi->agi_free_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ }
+}
+
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
+
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
+}
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+ struct xfs_inobt_rec_incore *rec)
+{
+ uint64_t bitmap = 0;
+ uint64_t inodespbit;
+ int nextbit;
+ uint allocbitmap;
+
+ /*
+ * The holemask has 16-bits for a 64 inode record. Therefore each
+ * holemask bit represents multiple inodes. Create a mask of bits to set
+ * in the allocmask for each holemask bit.
+ */
+ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+ /*
+ * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+ * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+ * anything beyond the 16 holemask bits since this casts to a larger
+ * type.
+ */
+ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+ /*
+ * allocbitmap is the inverted holemask so every set bit represents
+ * allocated inodes. To expand from 16-bit holemask granularity to
+ * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+ * bitmap for every holemask bit.
+ */
+ nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+ while (nextbit != -1) {
+ ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+ bitmap |= (inodespbit <<
+ (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+ nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+ }
+
+ return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount = 0;
+ int nextbit = 0;
+ uint64_t allocbmap;
+ int wordsz;
+
+ wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+ allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+ while (nextbit != -1) {
+ inocount++;
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+ nextbit + 1);
+ }
+
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agblock_t agblocks = pag->block_count;
+
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (M_IGEO(mp)->inobt_mxr[0] == 0)
+ return 0;
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
+ (uint64_t)agblocks * mp->m_sb.sb_inopblock /
+ XFS_INODES_PER_CHUNK);
+}
+
+/* Read AGI and create inobt cursor. */
+int
+xfs_inobt_cur(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t which,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(*agi_bpp == NULL);
+ ASSERT(*curpp == NULL);
+
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ if (error)
+ return error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which);
+ *curpp = cur;
+ return 0;
+}
+
+static int
+xfs_inobt_count_blocks(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ int error;
+
+ error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp);
+ if (error)
+ return error;
+
+ error = xfs_btree_count_blocks(cur, tree_blocks);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
+
+ return error;
+}
+
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ xfs_extlen_t tree_len = 0;
+ int error;
+
+ if (!xfs_has_finobt(mp))
+ return 0;
+
+ if (xfs_has_inobtcounts(mp))
+ error = xfs_finobt_read_blocks(pag, tp, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
+ &tree_len);
+ if (error)
+ return error;
+
+ *ask += xfs_inobt_max_size(pag);
+ *used += tree_len;
+ return 0;
+}
+
+/* Calculate the inobt btree size for some records. */
+xfs_extlen_t
+xfs_iallocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
+}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000..26451cb76
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) \
+ (xfs_has_crc(((mp))) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+ ((xfs_inobt_rec_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+ ((xfs_inobt_key_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_inobt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_inobt_key_t) + \
+ ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+ struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec) 0
+#endif /* DEBUG */
+
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_btnum_t btnum,
+ struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
+#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
new file mode 100644
index 000000000..773cf4349
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -0,0 +1,1050 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+
+/*
+ * In-core extent record layout:
+ *
+ * +-------+----------------------------+
+ * | 00:53 | all 54 bits of startoff |
+ * | 54:63 | low 10 bits of startblock |
+ * +-------+----------------------------+
+ * | 00:20 | all 21 bits of length |
+ * | 21 | unwritten extent bit |
+ * | 22:63 | high 42 bits of startblock |
+ * +-------+----------------------------+
+ */
+#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN)
+#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
+#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
+
+struct xfs_iext_rec {
+ uint64_t lo;
+ uint64_t hi;
+};
+
+/*
+ * Given that the length can't be a zero, only an empty hi value indicates an
+ * unused record.
+ */
+static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
+{
+ return rec->hi == 0;
+}
+
+static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
+{
+ rec->lo = 0;
+ rec->hi = 0;
+}
+
+static void
+xfs_iext_set(
+ struct xfs_iext_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
+ ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
+ ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
+
+ rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
+ rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
+
+ rec->lo |= (irec->br_startblock << 54);
+ rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
+
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rec->hi |= (1 << 21);
+}
+
+static void
+xfs_iext_get(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_iext_rec *rec)
+{
+ irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ irec->br_startblock = rec->lo >> 54;
+ irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
+
+ if (rec->hi & (1 << 21))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
+}
+
+enum {
+ NODE_SIZE = 256,
+ KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
+ RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
+ sizeof(struct xfs_iext_rec),
+};
+
+/*
+ * In-core extent btree block layout:
+ *
+ * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
+ *
+ * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
+ * contain the startoffset, blockcount, startblock and unwritten extent flag.
+ * See above for the exact format, followed by pointers to the previous and next
+ * leaf blocks (if there are any).
+ *
+ * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
+ * by an equal number of pointers to the btree blocks at the next lower level.
+ *
+ * +-------+-------+-------+-------+-------+----------+----------+
+ * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
+ * +-------+-------+-------+-------+-------+----------+----------+
+ *
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ */
+struct xfs_iext_node {
+ uint64_t keys[KEYS_PER_NODE];
+#define XFS_IEXT_KEY_INVALID (1ULL << 63)
+ void *ptrs[KEYS_PER_NODE];
+};
+
+struct xfs_iext_leaf {
+ struct xfs_iext_rec recs[RECS_PER_LEAF];
+ struct xfs_iext_leaf *prev;
+ struct xfs_iext_leaf *next;
+};
+
+inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
+{
+ return ifp->if_bytes / sizeof(struct xfs_iext_rec);
+}
+
+static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
+{
+ if (ifp->if_height == 1)
+ return xfs_iext_count(ifp);
+ return RECS_PER_LEAF;
+}
+
+static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
+{
+ return &cur->leaf->recs[cur->pos];
+}
+
+static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf)
+ return false;
+ if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
+ return false;
+ if (xfs_iext_rec_is_empty(cur_rec(cur)))
+ return false;
+ return true;
+}
+
+static void *
+xfs_iext_find_first_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ node = node->ptrs[0];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+static void *
+xfs_iext_find_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > 1; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (!node->ptrs[i])
+ break;
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ return node;
+}
+
+void
+xfs_iext_first(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ cur->pos = 0;
+ cur->leaf = xfs_iext_find_first_leaf(ifp);
+}
+
+void
+xfs_iext_last(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ int i;
+
+ cur->leaf = xfs_iext_find_last_leaf(ifp);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return;
+ }
+
+ for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
+ break;
+ }
+ cur->pos = i - 1;
+}
+
+void
+xfs_iext_next(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_first(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos < xfs_iext_max_recs(ifp));
+
+ cur->pos++;
+ if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
+ cur->leaf->next) {
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ }
+}
+
+void
+xfs_iext_prev(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ if (!cur->leaf) {
+ ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+ xfs_iext_last(ifp, cur);
+ return;
+ }
+
+ ASSERT(cur->pos >= 0);
+ ASSERT(cur->pos <= RECS_PER_LEAF);
+
+recurse:
+ do {
+ cur->pos--;
+ if (xfs_iext_valid(ifp, cur))
+ return;
+ } while (cur->pos > 0);
+
+ if (ifp->if_height > 1 && cur->leaf->prev) {
+ cur->leaf = cur->leaf->prev;
+ cur->pos = RECS_PER_LEAF;
+ goto recurse;
+ }
+}
+
+static inline int
+xfs_iext_key_cmp(
+ struct xfs_iext_node *node,
+ int n,
+ xfs_fileoff_t offset)
+{
+ if (node->keys[n] > offset)
+ return 1;
+ if (node->keys[n] < offset)
+ return -1;
+ return 0;
+}
+
+static inline int
+xfs_iext_rec_cmp(
+ struct xfs_iext_rec *rec,
+ xfs_fileoff_t offset)
+{
+ uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
+ uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+ if (rec_offset > offset)
+ return 1;
+ if (rec_offset + rec_len <= offset)
+ return -1;
+ return 0;
+}
+
+static void *
+xfs_iext_find_level(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ int level)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ if (!ifp->if_height)
+ return NULL;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+
+ node = node->ptrs[i - 1];
+ if (!node)
+ break;
+ }
+
+ return node;
+}
+
+static int
+xfs_iext_node_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 1; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ break;
+ }
+
+ return i - 1;
+}
+
+static int
+xfs_iext_node_insert_pos(
+ struct xfs_iext_node *node,
+ xfs_fileoff_t offset)
+{
+ int i;
+
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (xfs_iext_key_cmp(node, i, offset) > 0)
+ return i;
+ }
+
+ return KEYS_PER_NODE;
+}
+
+static int
+xfs_iext_node_nr_entries(
+ struct xfs_iext_node *node,
+ int start)
+{
+ int i;
+
+ for (i = start; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ }
+
+ return i;
+}
+
+static int
+xfs_iext_leaf_nr_entries(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_leaf *leaf,
+ int start)
+{
+ int i;
+
+ for (i = start; i < xfs_iext_max_recs(ifp); i++) {
+ if (xfs_iext_rec_is_empty(&leaf->recs[i]))
+ break;
+ }
+
+ return i;
+}
+
+static inline uint64_t
+xfs_iext_leaf_key(
+ struct xfs_iext_leaf *leaf,
+ int n)
+{
+ return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
+}
+
+static void
+xfs_iext_grow(
+ struct xfs_ifork *ifp)
+{
+ struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ int i;
+
+ if (ifp->if_height == 1) {
+ struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+
+ node->keys[0] = xfs_iext_leaf_key(prev, 0);
+ node->ptrs[0] = prev;
+ } else {
+ struct xfs_iext_node *prev = ifp->if_u1.if_root;
+
+ ASSERT(ifp->if_height > 1);
+
+ node->keys[0] = prev->keys[0];
+ node->ptrs[0] = prev;
+ }
+
+ for (i = 1; i < KEYS_PER_NODE; i++)
+ node->keys[i] = XFS_IEXT_KEY_INVALID;
+
+ ifp->if_u1.if_root = node;
+ ifp->if_height++;
+}
+
+static void
+xfs_iext_update_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t old_offset,
+ xfs_fileoff_t new_offset,
+ int level,
+ void *ptr)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
+ break;
+ if (node->keys[i] == old_offset)
+ node->keys[i] = new_offset;
+ }
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ ASSERT(node == ptr);
+}
+
+static struct xfs_iext_node *
+xfs_iext_split_node(
+ struct xfs_iext_node **nodep,
+ int *pos,
+ int *nr_entries)
+{
+ struct xfs_iext_node *node = *nodep;
+ struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = KEYS_PER_NODE / 2;
+ int nr_keep = nr_move + (KEYS_PER_NODE & 1);
+ int i = 0;
+
+ /* for sequential append operations just spill over into the new node */
+ if (*pos == KEYS_PER_NODE) {
+ *nodep = new;
+ *pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+
+ for (i = 0; i < nr_move; i++) {
+ new->keys[i] = node->keys[nr_keep + i];
+ new->ptrs[i] = node->ptrs[nr_keep + i];
+
+ node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_keep + i] = NULL;
+ }
+
+ if (*pos >= nr_keep) {
+ *nodep = new;
+ *pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ for (; i < KEYS_PER_NODE; i++)
+ new->keys[i] = XFS_IEXT_KEY_INVALID;
+ return new;
+}
+
+static void
+xfs_iext_insert_node(
+ struct xfs_ifork *ifp,
+ uint64_t offset,
+ void *ptr,
+ int level)
+{
+ struct xfs_iext_node *node, *new;
+ int i, pos, nr_entries;
+
+again:
+ if (ifp->if_height < level)
+ xfs_iext_grow(ifp);
+
+ new = NULL;
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_insert_pos(node, offset);
+ nr_entries = xfs_iext_node_nr_entries(node, pos);
+
+ ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
+ ASSERT(nr_entries <= KEYS_PER_NODE);
+
+ if (nr_entries == KEYS_PER_NODE)
+ new = xfs_iext_split_node(&node, &pos, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (node != new && pos == 0 && nr_entries > 0)
+ xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
+
+ for (i = nr_entries; i > pos; i--) {
+ node->keys[i] = node->keys[i - 1];
+ node->ptrs[i] = node->ptrs[i - 1];
+ }
+ node->keys[pos] = offset;
+ node->ptrs[pos] = ptr;
+
+ if (new) {
+ offset = new->keys[0];
+ ptr = new;
+ level++;
+ goto again;
+ }
+}
+
+static struct xfs_iext_leaf *
+xfs_iext_split_leaf(
+ struct xfs_iext_cursor *cur,
+ int *nr_entries)
+{
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = RECS_PER_LEAF / 2;
+ int nr_keep = nr_move + (RECS_PER_LEAF & 1);
+ int i;
+
+ /* for sequential append operations just spill over into the new node */
+ if (cur->pos == RECS_PER_LEAF) {
+ cur->leaf = new;
+ cur->pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+ for (i = 0; i < nr_move; i++) {
+ new->recs[i] = leaf->recs[nr_keep + i];
+ xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
+ }
+
+ if (cur->pos >= nr_keep) {
+ cur->leaf = new;
+ cur->pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ if (leaf->next)
+ leaf->next->prev = new;
+ new->next = leaf->next;
+ new->prev = leaf;
+ leaf->next = new;
+ return new;
+}
+
+static void
+xfs_iext_alloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ ASSERT(ifp->if_bytes == 0);
+
+ ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_height = 1;
+
+ /* now that we have a node step into it */
+ cur->leaf = ifp->if_u1.if_root;
+ cur->pos = 0;
+}
+
+static void
+xfs_iext_realloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
+ void *new;
+
+ /* account for the prev/next pointers */
+ if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
+ new_size = NODE_SIZE;
+
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
+ memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
+ ifp->if_u1.if_root = new;
+ cur->leaf = new;
+}
+
+/*
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
+ */
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
+{
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+}
+
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ xfs_fileoff_t offset = irec->br_startoff;
+ struct xfs_iext_leaf *new = NULL;
+ int nr_entries, i;
+
+ xfs_iext_inc_seq(ifp);
+
+ if (ifp->if_height == 0)
+ xfs_iext_alloc_root(ifp, cur);
+ else if (ifp->if_height == 1)
+ xfs_iext_realloc_root(ifp, cur);
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
+ ASSERT(nr_entries <= RECS_PER_LEAF);
+ ASSERT(cur->pos >= nr_entries ||
+ xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
+
+ if (nr_entries == RECS_PER_LEAF)
+ new = xfs_iext_split_leaf(cur, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
+ offset, 1, cur->leaf);
+ }
+
+ for (i = nr_entries; i > cur->pos; i--)
+ cur->leaf->recs[i] = cur->leaf->recs[i - 1];
+ xfs_iext_set(cur_rec(cur), irec);
+ ifp->if_bytes += sizeof(struct xfs_iext_rec);
+
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
+ if (new)
+ xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
+}
+
+static struct xfs_iext_node *
+xfs_iext_rebalance_node(
+ struct xfs_iext_node *parent,
+ int *pos,
+ struct xfs_iext_node *node,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full, or have different
+ * parents, we might never be able to merge our node, and will only
+ * delete it once the number of entries hits zero.
+ */
+ if (nr_entries == 0)
+ return node;
+
+ if (*pos > 0) {
+ struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
+ int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
+
+ if (nr_prev + nr_entries <= KEYS_PER_NODE) {
+ for (i = 0; i < nr_entries; i++) {
+ prev->keys[nr_prev + i] = node->keys[i];
+ prev->ptrs[nr_prev + i] = node->ptrs[i];
+ }
+ return node;
+ }
+ }
+
+ if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
+ struct xfs_iext_node *next = parent->ptrs[*pos + 1];
+ int nr_next = xfs_iext_node_nr_entries(next, 0), i;
+
+ if (nr_entries + nr_next <= KEYS_PER_NODE) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ node->keys[nr_entries + i] = next->keys[i];
+ node->ptrs[nr_entries + i] = next->ptrs[i];
+ }
+
+ ++*pos;
+ return next;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+xfs_iext_remove_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ void *victim)
+{
+ struct xfs_iext_node *node, *parent;
+ int level = 2, pos, nr_entries, i;
+
+ ASSERT(level <= ifp->if_height);
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(node, offset);
+again:
+ ASSERT(node->ptrs[pos]);
+ ASSERT(node->ptrs[pos] == victim);
+ kmem_free(victim);
+
+ nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
+ offset = node->keys[0];
+ for (i = pos; i < nr_entries; i++) {
+ node->keys[i] = node->keys[i + 1];
+ node->ptrs[i] = node->ptrs[i + 1];
+ }
+ node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_entries] = NULL;
+
+ if (pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
+ offset = node->keys[0];
+ }
+
+ if (nr_entries >= KEYS_PER_NODE / 2)
+ return;
+
+ if (level < ifp->if_height) {
+ /*
+ * If we aren't at the root yet try to find a neighbour node to
+ * merge with (or delete the node if it is empty), and then
+ * recurse up to the next level.
+ */
+ level++;
+ parent = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(parent, offset);
+
+ ASSERT(pos != KEYS_PER_NODE);
+ ASSERT(parent->ptrs[pos] == node);
+
+ node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
+ if (node) {
+ victim = node;
+ node = parent;
+ goto again;
+ }
+ } else if (nr_entries == 1) {
+ /*
+ * If we are at the root and only one entry is left we can just
+ * free this node and update the root pointer.
+ */
+ ASSERT(node == ifp->if_u1.if_root);
+ ifp->if_u1.if_root = node->ptrs[0];
+ ifp->if_height--;
+ kmem_free(node);
+ }
+}
+
+static void
+xfs_iext_rebalance_leaf(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_iext_leaf *leaf,
+ xfs_fileoff_t offset,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full we might never be able
+ * to merge our node, and will only delete it once the number of
+ * entries hits zero.
+ */
+ if (nr_entries == 0)
+ goto remove_node;
+
+ if (leaf->prev) {
+ int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
+
+ if (nr_prev + nr_entries <= RECS_PER_LEAF) {
+ for (i = 0; i < nr_entries; i++)
+ leaf->prev->recs[nr_prev + i] = leaf->recs[i];
+
+ if (cur->leaf == leaf) {
+ cur->leaf = leaf->prev;
+ cur->pos += nr_prev;
+ }
+ goto remove_node;
+ }
+ }
+
+ if (leaf->next) {
+ int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
+
+ if (nr_entries + nr_next <= RECS_PER_LEAF) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ leaf->recs[nr_entries + i] =
+ leaf->next->recs[i];
+ }
+
+ if (cur->leaf == leaf->next) {
+ cur->leaf = leaf;
+ cur->pos += nr_entries;
+ }
+
+ offset = xfs_iext_leaf_key(leaf->next, 0);
+ leaf = leaf->next;
+ goto remove_node;
+ }
+ }
+
+ return;
+remove_node:
+ if (leaf->prev)
+ leaf->prev->next = leaf->next;
+ if (leaf->next)
+ leaf->next->prev = leaf->prev;
+ xfs_iext_remove_node(ifp, offset, leaf);
+}
+
+static void
+xfs_iext_free_last_leaf(
+ struct xfs_ifork *ifp)
+{
+ ifp->if_height--;
+ kmem_free(ifp->if_u1.if_root);
+ ifp->if_u1.if_root = NULL;
+}
+
+void
+xfs_iext_remove(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0);
+ int i, nr_entries;
+
+ trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
+
+ ASSERT(ifp->if_height > 0);
+ ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(xfs_iext_valid(ifp, cur));
+
+ xfs_iext_inc_seq(ifp);
+
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
+ for (i = cur->pos; i < nr_entries; i++)
+ leaf->recs[i] = leaf->recs[i + 1];
+ xfs_iext_rec_clear(&leaf->recs[nr_entries]);
+ ifp->if_bytes -= sizeof(struct xfs_iext_rec);
+
+ if (cur->pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
+ leaf);
+ offset = xfs_iext_leaf_key(leaf, 0);
+ } else if (cur->pos == nr_entries) {
+ if (ifp->if_height > 1 && leaf->next)
+ cur->leaf = leaf->next;
+ else
+ cur->leaf = NULL;
+ cur->pos = 0;
+ }
+
+ if (nr_entries >= RECS_PER_LEAF / 2)
+ return;
+
+ if (ifp->if_height > 1)
+ xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
+ else if (nr_entries == 0)
+ xfs_iext_free_last_leaf(ifp);
+}
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno return the extent index, and store the
+ * expanded extent structure in *gotp, and the extent cursor in *cur.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its cursor in *cur
+ * instead.
+ * If bno is beyond the last extent return false, and return an invalid
+ * cursor value.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ cur->leaf = xfs_iext_find_level(ifp, offset, 1);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return false;
+ }
+
+ for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
+ struct xfs_iext_rec *rec = cur_rec(cur);
+
+ if (xfs_iext_rec_is_empty(rec))
+ break;
+ if (xfs_iext_rec_cmp(rec, offset) >= 0)
+ goto found;
+ }
+
+ /* Try looking in the next node for an entry > offset */
+ if (ifp->if_height == 1 || !cur->leaf->next)
+ return false;
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+found:
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * Returns the last extent before end, and if this extent doesn't cover
+ * end, update end to the end of the extent.
+ */
+bool
+xfs_iext_lookup_extent_before(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ /* could be optimized to not even look up the next on a match.. */
+ if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
+ gotp->br_startoff <= *end - 1)
+ return true;
+ if (!xfs_iext_prev_extent(ifp, cur, gotp))
+ return false;
+ *end = gotp->br_startoff + gotp->br_blockcount;
+ return true;
+}
+
+void
+xfs_iext_update_extent(
+ struct xfs_inode *ip,
+ int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *new)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ xfs_iext_inc_seq(ifp);
+
+ if (cur->pos == 0) {
+ struct xfs_bmbt_irec old;
+
+ xfs_iext_get(&old, cur_rec(cur));
+ if (new->br_startoff != old.br_startoff) {
+ xfs_iext_update_node(ifp, old.br_startoff,
+ new->br_startoff, 1, cur->leaf);
+ }
+ }
+
+ trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
+ xfs_iext_set(cur_rec(cur), new);
+ trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
+}
+
+/*
+ * Return true if the cursor points at an extent and return the extent structure
+ * in gotp. Else return false.
+ */
+bool
+xfs_iext_get_extent(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * This is a recursive function, because of that we need to be extremely
+ * careful with stack usage.
+ */
+static void
+xfs_iext_destroy_node(
+ struct xfs_iext_node *node,
+ int level)
+{
+ int i;
+
+ if (level > 1) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ if (node->keys[i] == XFS_IEXT_KEY_INVALID)
+ break;
+ xfs_iext_destroy_node(node->ptrs[i], level - 1);
+ }
+ }
+
+ kmem_free(node);
+}
+
+void
+xfs_iext_destroy(
+ struct xfs_ifork *ifp)
+{
+ xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+
+ ifp->if_bytes = 0;
+ ifp->if_height = 0;
+ ifp->if_u1.if_root = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000..758aacd81
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,773 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_inode.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+
+#include <linux/iversion.h>
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behaviour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
+ */
+static void
+xfs_inode_buf_verify(
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int i;
+ int ni;
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ */
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++) {
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
+
+ dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
+ unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
+ xfs_dinode_good_version(mp, dip->di_version) &&
+ xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP))) {
+ if (readahead) {
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
+ return;
+ }
+
+#ifdef DEBUG
+ xfs_alert(mp,
+ "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+ (unsigned long long)xfs_buf_daddr(bp), i,
+ be16_to_cpu(dip->di_magic));
+#endif
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, dip, sizeof(*dip),
+ NULL);
+ return;
+ }
+ }
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
+ .verify_read = xfs_inode_buf_read_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
+ .verify_read = xfs_inode_buf_readahead_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode. It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter.
+ */
+int
+xfs_imap_to_bp(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_imap *imap,
+ struct xfs_buf **bpp)
+{
+ return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp,
+ &xfs_inode_buf_ops);
+}
+
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
+}
+
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
+int
+xfs_inode_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *from)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+ xfs_failaddr_t fa;
+
+ ASSERT(ip->i_cowfp == NULL);
+
+ fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from,
+ sizeof(*from), fa);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * First get the permanent information that is needed to allocate an
+ * inode. If the inode is unused, mode is zero and we shouldn't mess
+ * with the uninitialized part of it.
+ */
+ if (!xfs_has_v3inodes(ip->i_mount))
+ ip->i_flushiter = be16_to_cpu(from->di_flushiter);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+ if (!inode->i_mode)
+ return 0;
+
+ /*
+ * Convert v1 inodes immediately to v2 inode format as this is the
+ * minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
+ */
+ if (unlikely(from->di_version == 1)) {
+ set_nlink(inode, be16_to_cpu(from->di_onlink));
+ ip->i_projid = 0;
+ } else {
+ set_nlink(inode, be32_to_cpu(from->di_nlink));
+ ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ be16_to_cpu(from->di_projid_lo);
+ }
+
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
+
+ /*
+ * Time is signed, so need to convert to signed 32 bit before
+ * storing in inode timestamp which may be 64 bit. Otherwise
+ * a time before epoch is converted to a time long after epoch
+ * on 64 bit systems.
+ */
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+
+ ip->i_disk_size = be64_to_cpu(from->di_size);
+ ip->i_nblocks = be64_to_cpu(from->di_nblocks);
+ ip->i_extsize = be32_to_cpu(from->di_extsize);
+ ip->i_forkoff = from->di_forkoff;
+ ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
+
+ if (from->di_dmevmask || from->di_dmstate)
+ xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ inode_set_iversion_queried(inode,
+ be64_to_cpu(from->di_changecount));
+ ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
+ ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ }
+
+ error = xfs_iformat_data_fork(ip, from);
+ if (error)
+ return error;
+ if (from->di_forkoff) {
+ error = xfs_iformat_attr_fork(ip, from);
+ if (error)
+ goto out_destroy_data_fork;
+ }
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+ return 0;
+
+out_destroy_data_fork:
+ xfs_idestroy_fork(&ip->i_df);
+ return error;
+}
+
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
+}
+
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than was previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
+ }
+}
+
+void
+xfs_inode_to_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ to->di_onlink = 0;
+
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
+ to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
+
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
+ to->di_nlink = cpu_to_be32(inode->i_nlink);
+ to->di_gen = cpu_to_be32(inode->i_generation);
+ to->di_mode = cpu_to_be16(inode->i_mode);
+
+ to->di_size = cpu_to_be64(ip->i_disk_size);
+ to->di_nblocks = cpu_to_be64(ip->i_nblocks);
+ to->di_extsize = cpu_to_be32(ip->i_extsize);
+ to->di_forkoff = ip->i_forkoff;
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
+ to->di_flags = cpu_to_be16(ip->i_diflags);
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ to->di_version = 3;
+ to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
+ to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
+ to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
+ to->di_ino = cpu_to_be64(ip->i_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_v3_pad = 0;
+ } else {
+ to->di_version = 2;
+ to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ }
+
+ xfs_inode_to_disk_iext_counters(ip, to);
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_fork(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp,
+ int whichfork)
+{
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * No local regular files yet.
+ */
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
+ if (di_nextents)
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_forkoff(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp)
+{
+ if (!dip->di_forkoff)
+ return NULL;
+
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_LOCAL: /* fall through ... */
+ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
+ case XFS_DINODE_FMT_BTREE:
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_dinode_verify(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip)
+{
+ xfs_failaddr_t fa;
+ uint16_t mode;
+ uint16_t flags;
+ uint64_t flags2;
+ uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
+
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+ return __this_address;
+
+ /* Verify v3 integrity information first */
+ if (dip->di_version >= 3) {
+ if (!xfs_has_v3inodes(mp))
+ return __this_address;
+ if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ return __this_address;
+ if (be64_to_cpu(dip->di_ino) != ino)
+ return __this_address;
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ }
+
+ /* don't allow invalid i_size */
+ di_size = be64_to_cpu(dip->di_size);
+ if (di_size & (1ULL << 63))
+ return __this_address;
+
+ mode = be16_to_cpu(dip->di_mode);
+ if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
+ return __this_address;
+
+ /* No zero-length symlinks/dirs. */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
+ return __this_address;
+
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
+ /* Fork checks carried over from xfs_iformat_fork */
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
+ return __this_address;
+
+ if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
+ return __this_address;
+
+ flags = be16_to_cpu(dip->di_flags);
+
+ if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
+ return __this_address;
+
+ /* check for illegal values of forkoff */
+ fa = xfs_dinode_verify_forkoff(dip, mp);
+ if (fa)
+ return fa;
+
+ /* Do we have appropriate data fork formats for the mode? */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (dip->di_format != XFS_DINODE_FMT_DEV)
+ return __this_address;
+ break;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK);
+ if (fa)
+ return fa;
+ break;
+ case 0:
+ /* Uninitialized inode ok. */
+ break;
+ default:
+ return __this_address;
+ }
+
+ if (dip->di_forkoff) {
+ fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK);
+ if (fa)
+ return fa;
+ } else {
+ /*
+ * If there is no fork offset, this may be a freshly-made inode
+ * in a new disk cluster, in which case di_aformat is zeroed.
+ * Otherwise, such an inode must be in EXTENTS format; this goes
+ * for freed inodes as well.
+ */
+ switch (dip->di_aformat) {
+ case 0:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ return __this_address;
+ }
+ if (naextents)
+ return __this_address;
+ }
+
+ /* extent size hint validation */
+ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa)
+ return fa;
+
+ /* only version 3 or greater inodes are extensively verified here */
+ if (dip->di_version < 3)
+ return NULL;
+
+ flags2 = be64_to_cpu(dip->di_flags2);
+
+ /* don't allow reflink/cowextsize if we don't have reflink */
+ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+ !xfs_has_reflink(mp))
+ return __this_address;
+
+ /* only regular files get reflink */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG)
+ return __this_address;
+
+ /* don't let reflink and realtime mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ return __this_address;
+
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
+ return NULL;
+}
+
+void
+xfs_dinode_calc_crc(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ uint32_t crc;
+
+ if (dip->di_version < 3)
+ return;
+
+ ASSERT(xfs_has_crc(mp));
+ crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF);
+ dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Validate di_extsize hint.
+ *
+ * 1. Extent size hint is only valid for directories and regular files.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files.
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. Hint cannot be larger than MAXTEXTLEN.
+ * 5. Can be changed on directories at any time.
+ * 6. Hint value of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * For realtime files, this is the rt extent size.
+ * 8. For non-realtime files, the extent size hint must be limited
+ * to half the AG size to avoid alignment extending the extent beyond the
+ * limits of the AG.
+ */
+xfs_failaddr_t
+xfs_inode_validate_extsize(
+ struct xfs_mount *mp,
+ uint32_t extsize,
+ uint16_t mode,
+ uint16_t flags)
+{
+ bool rt_flag;
+ bool hint_flag;
+ bool inherit_flag;
+ uint32_t extsize_bytes;
+ uint32_t blocksize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
+ inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
+ extsize_bytes = XFS_FSB_TO_B(mp, extsize);
+
+ /*
+ * This comment describes a historic gap in this verifier function.
+ *
+ * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this
+ * function has never checked that the extent size hint is an integer
+ * multiple of the realtime extent size. Since we allow users to set
+ * this combination on non-rt filesystems /and/ to change the rt
+ * extent size when adding a rt device to a filesystem, the net effect
+ * is that users can configure a filesystem anticipating one rt
+ * geometry and change their minds later. Directories do not use the
+ * extent size hint, so this is harmless for them.
+ *
+ * If a directory with a misaligned extent size hint is allowed to
+ * propagate that hint into a new regular realtime file, the result
+ * is that the inode cluster buffer verifier will trigger a corruption
+ * shutdown the next time it is run, because the verifier has always
+ * enforced the alignment rule for regular files.
+ *
+ * Because we allow administrators to set a new rt extent size when
+ * adding a rt section, we cannot add a check to this verifier because
+ * that will result a new source of directory corruption errors when
+ * reading an existing filesystem. Instead, we rely on callers to
+ * decide when alignment checks are appropriate, and fix things up as
+ * needed.
+ */
+
+ if (rt_flag)
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
+ if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && !S_ISREG(mode))
+ return __this_address;
+
+ if (inherit_flag && !S_ISDIR(mode))
+ return __this_address;
+
+ if ((hint_flag || inherit_flag) && extsize == 0)
+ return __this_address;
+
+ /* free inodes get flags set to zero but extsize remains */
+ if (mode && !(hint_flag || inherit_flag) && extsize != 0)
+ return __this_address;
+
+ if (extsize_bytes % blocksize_bytes)
+ return __this_address;
+
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
+ return __this_address;
+
+ if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ * The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ * for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. Hint value of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ * alignment extending the extent beyond the limits of the AG.
+ */
+xfs_failaddr_t
+xfs_inode_validate_cowextsize(
+ struct xfs_mount *mp,
+ uint32_t cowextsize,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ bool rt_flag;
+ bool hint_flag;
+ uint32_t cowextsize_bytes;
+
+ rt_flag = (flags & XFS_DIFLAG_REALTIME);
+ hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
+ cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+
+ if (hint_flag && !xfs_has_reflink(mp))
+ return __this_address;
+
+ if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
+ return __this_address;
+
+ if (hint_flag && cowextsize == 0)
+ return __this_address;
+
+ /* free inodes get flags set to zero but cowextsize remains */
+ if (mode && !hint_flag && cowextsize != 0)
+ return __this_address;
+
+ if (hint_flag && rt_flag)
+ return __this_address;
+
+ if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ return __this_address;
+
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
+ return __this_address;
+
+ if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ return __this_address;
+
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644
index 000000000..585ed5a11
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_BUF_H__
+#define __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+
+/*
+ * Inode location information. Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+ xfs_daddr_t im_blkno; /* starting BB of inode chunk */
+ unsigned short im_len; /* length in BBs of inode chunk */
+ unsigned short im_boffset; /* inode offset in block in bytes */
+};
+
+int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_imap *imap, struct xfs_buf **bpp);
+void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip);
+void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
+ xfs_lsn_t lsn);
+int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
+
+xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_dinode *dip);
+xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
+ uint32_t extsize, uint16_t mode, uint16_t flags);
+xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
+ uint32_t cowextsize, uint16_t mode, uint16_t flags,
+ uint64_t flags2);
+
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+ return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
+static inline bool
+xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version)
+{
+ if (xfs_has_v3inodes(mp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000..6b2176018
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,779 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_types.h"
+#include "xfs_errortag.h"
+
+struct kmem_cache *xfs_ifork_cache;
+
+void
+xfs_init_local_fork(
+ struct xfs_inode *ip,
+ int whichfork,
+ const void *data,
+ int64_t size)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int mem_size = size;
+ bool zero_terminate;
+
+ /*
+ * If we are using the local fork to store a symlink body we need to
+ * zero-terminate it so that we can pass it back to the VFS directly.
+ * Overallocate the in-memory fork by one for that and add a zero
+ * to terminate it below.
+ */
+ zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+ if (zero_terminate)
+ mem_size++;
+
+ if (size) {
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
+ memcpy(ifp->if_u1.if_data, data, size);
+ if (zero_terminate)
+ ifp->if_u1.if_data[size] = '\0';
+ } else {
+ ifp->if_u1.if_data = NULL;
+ }
+
+ ifp->if_bytes = size;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ */
+STATIC int
+xfs_iformat_local(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
+{
+ /*
+ * If the size is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %llu (bad size %d for local fork, size = %zd).",
+ (unsigned long long) ip->i_ino, size,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_local", dip, sizeof(*dip),
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
+ return 0;
+}
+
+/*
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode.
+ */
+STATIC int
+xfs_iformat_extents(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int state = xfs_bmap_fork_to_state(whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
+ int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_rec *dp;
+ struct xfs_bmbt_irec new;
+ int i;
+
+ /*
+ * If the number of extents is unreasonable, then something is wrong and
+ * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(1)", dip, sizeof(*dip),
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ if (size) {
+ dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+
+ xfs_iext_first(ifp, &icur);
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(dp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_extents(2)",
+ dp, sizeof(*dp), fa);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
+ }
+ }
+ return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it. The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_bmdr_block_t *dfp;
+ struct xfs_ifork *ifp;
+ /* REFERENCED */
+ int nrecs;
+ int size;
+ int level;
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+ size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ /*
+ * blow out if -- fork has less extents than can fit in
+ * fork (fork shouldn't be a btree format), root btree
+ * block has more records than can fit into the fork,
+ * or the number of extents is greater than the number of
+ * blocks.
+ */
+ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
+ nrecs == 0 ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, mp, whichfork) ||
+ ifp->if_nextents > ip->i_nblocks) ||
+ level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
+ xfs_warn(mp, "corrupt inode %llu (btree).",
+ (unsigned long long) ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iformat_btree", dfp, size,
+ __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ ifp->if_broot_bytes = size;
+ ifp->if_broot = kmem_alloc(size, KM_NOFS);
+ ASSERT(ifp->if_broot != NULL);
+ /*
+ * Copy and convert from the on-disk structure
+ * to the in-memory structure.
+ */
+ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ ifp->if_broot, size);
+
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+ return 0;
+}
+
+int
+xfs_iformat_data_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ ip->i_df.if_format = dip->di_format;
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_disk_size = 0;
+ inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
+ return 0;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_DATA_FORK,
+ be64_to_cpu(dip->di_size));
+ if (!error)
+ error = xfs_ifork_verify_local_data(ip);
+ return error;
+ case XFS_DINODE_FMT_EXTENTS:
+ return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+ case XFS_DINODE_FMT_BTREE:
+ return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+}
+
+static uint16_t
+xfs_dfork_attr_shortform_size(
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_shortform *atp =
+ (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+
+ return be16_to_cpu(atp->hdr.totsize);
+}
+
+void
+xfs_ifork_init_attr(
+ struct xfs_inode *ip,
+ enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents)
+{
+ ip->i_af.if_format = format;
+ ip->i_af.if_nextents = nextents;
+}
+
+void
+xfs_ifork_zap_attr(
+ struct xfs_inode *ip)
+{
+ xfs_idestroy_fork(&ip->i_af);
+ memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+}
+
+int
+xfs_iformat_attr_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
+ int error = 0;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
+
+ switch (ip->i_af.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
+ xfs_dfork_attr_shortform_size(dip));
+ if (!error)
+ error = xfs_ifork_verify_local_attr(ip);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+ break;
+ default:
+ xfs_inode_verifier_error(ip, error, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (error)
+ xfs_ifork_zap_attr(ip);
+ return error;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff. Move the records
+ * and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller. When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ * requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+ xfs_inode_t *ip,
+ int rec_diff,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int cur_max;
+ struct xfs_ifork *ifp;
+ struct xfs_btree_block *new_broot;
+ int new_max;
+ size_t new_size;
+ char *np;
+ char *op;
+
+ /*
+ * Handle the degenerate case quietly.
+ */
+ if (rec_diff == 0) {
+ return;
+ }
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ if (rec_diff > 0) {
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (ifp->if_broot_bytes == 0) {
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+ ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+ ifp->if_broot_bytes = (int)new_size;
+ return;
+ }
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ (int)new_size);
+ ifp->if_broot_bytes = (int)new_size;
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+ return;
+ }
+
+ /*
+ * rec_diff is less than 0. In this case, we are shrinking the
+ * if_broot buffer. It must already exist. If we go to zero
+ * records, just get rid of the root and clear the status bit.
+ */
+ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ ASSERT(new_max >= 0);
+ if (new_max > 0)
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ else
+ new_size = 0;
+ if (new_size > 0) {
+ new_broot = kmem_alloc(new_size, KM_NOFS);
+ /*
+ * First copy over the btree block header.
+ */
+ memcpy(new_broot, ifp->if_broot,
+ XFS_BMBT_BLOCK_LEN(ip->i_mount));
+ } else {
+ new_broot = NULL;
+ }
+
+ /*
+ * Only copy the records and pointers if there are any.
+ */
+ if (new_max > 0) {
+ /*
+ * First copy the records.
+ */
+ op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+ np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+ /*
+ * Then copy the pointers.
+ */
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+ (int)new_size);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
+ }
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = new_broot;
+ ifp->if_broot_bytes = (int)new_size;
+ if (ifp->if_broot)
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased. The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer. Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ * requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+ struct xfs_inode *ip,
+ int64_t byte_diff,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int64_t new_size = ifp->if_bytes + byte_diff;
+
+ ASSERT(new_size >= 0);
+ ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
+
+ if (byte_diff == 0)
+ return;
+
+ if (new_size == 0) {
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ ifp->if_bytes = 0;
+ return;
+ }
+
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ ifp->if_bytes = new_size;
+}
+
+void
+xfs_idestroy_fork(
+ struct xfs_ifork *ifp)
+{
+ if (ifp->if_broot != NULL) {
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = NULL;
+ }
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ if (ifp->if_height)
+ xfs_iext_destroy(ifp);
+ break;
+ }
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine. We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_rec *dp,
+ int whichfork)
+{
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ int64_t copied = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ ASSERT(ifp->if_bytes > 0);
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ ASSERT(xfs_bmap_validate_extent(ip, whichfork, &rec) == NULL);
+ xfs_bmbt_disk_set_all(dp, &rec);
+ trace_xfs_write_extent(ip, &icur, state, _RET_IP_);
+ copied += sizeof(struct xfs_bmbt_rec);
+ dp++;
+ }
+
+ ASSERT(copied > 0);
+ ASSERT(copied <= ifp->if_bytes);
+ return copied;
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_item *iip,
+ int whichfork)
+{
+ char *cp;
+ struct xfs_ifork *ifp;
+ xfs_mount_t *mp;
+ static const short brootflag[2] =
+ { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+ static const short dataflag[2] =
+ { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+ static const short extflag[2] =
+ { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+ if (!iip)
+ return;
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ /*
+ * This can happen if we gave up in iformat in an error path,
+ * for the attribute fork.
+ */
+ if (!ifp) {
+ ASSERT(whichfork == XFS_ATTR_FORK);
+ return;
+ }
+ cp = XFS_DFORK_PTR(dip, whichfork);
+ mp = ip->i_mount;
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_fields & dataflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
+ memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ }
+ break;
+
+ case XFS_DINODE_FMT_EXTENTS:
+ if ((iip->ili_fields & extflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_nextents > 0);
+ (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+ whichfork);
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_fields & brootflag[whichfork]) &&
+ (ifp->if_broot_bytes > 0)) {
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+ (xfs_bmdr_block_t *)cp,
+ XFS_DFORK_SIZE(dip, mp, whichfork));
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ if (iip->ili_fields & XFS_ILOG_DEV) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ xfs_dinode_put_rdev(dip,
+ linux_to_xfs_dev_t(VFS_I(ip)->i_rdev));
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/* Convert bmap state flags to an inode fork. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+ struct xfs_inode *ip,
+ int state)
+{
+ if (state & BMAP_COWFORK)
+ return ip->i_cowfp;
+ else if (state & BMAP_ATTRFORK)
+ return &ip->i_af;
+ return &ip->i_df;
+}
+
+/*
+ * Initialize an inode's copy-on-write fork.
+ */
+void
+xfs_ifork_init_cow(
+ struct xfs_inode *ip)
+{
+ if (ip->i_cowfp)
+ return;
+
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
+}
+
+/* Verify the inline contents of the data fork of an inode. */
+int
+xfs_ifork_verify_local_data(
+ struct xfs_inode *ip)
+{
+ xfs_failaddr_t fa = NULL;
+
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
+ case S_IFDIR:
+ fa = xfs_dir2_sf_verify(ip);
+ break;
+ case S_IFLNK:
+ fa = xfs_symlink_shortform_verify(ip);
+ break;
+ default:
+ break;
+ }
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Verify the inline contents of the attr fork of an inode. */
+int
+xfs_ifork_verify_local_attr(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+ xfs_failaddr_t fa;
+
+ if (!xfs_inode_has_attr_fork(ip))
+ fa = __this_address;
+ else
+ fa = xfs_attr_shortform_verify(ip);
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+int
+xfs_iext_count_may_overflow(
+ struct xfs_inode *ip,
+ int whichfork,
+ int nr_to_add)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ uint64_t max_exts;
+ uint64_t nr_exts;
+
+ if (whichfork == XFS_COW_FORK)
+ return 0;
+
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
+
+ if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ max_exts = 10;
+
+ nr_exts = ifp->if_nextents + nr_to_add;
+ if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ return -EFBIG;
+
+ return 0;
+}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000..d3943d6ad
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_FORK_H__
+#define __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+struct xfs_dinode;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+struct xfs_ifork {
+ int64_t if_bytes; /* bytes in if_u1 */
+ struct xfs_btree_block *if_broot; /* file's incore btree root */
+ unsigned int if_seq; /* fork mod counter */
+ int if_height; /* height of the extent tree */
+ union {
+ void *if_root; /* extent tree root */
+ char *if_data; /* inline file data */
+ } if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
+ short if_broot_bytes; /* bytes allocated for root */
+ int8_t if_format; /* format of this fork */
+};
+
+/*
+ * Worst-case increase in the fork extent count when we're adding a single
+ * extent to a fork and there's no possibility of splitting an existing mapping.
+ */
+#define XFS_IEXT_ADD_NOSPLIT_CNT (1)
+
+/*
+ * Punching out an extent from the middle of an existing extent can cause the
+ * extent count to increase by 1.
+ * i.e. | Old extent | Hole | Old extent |
+ */
+#define XFS_IEXT_PUNCH_HOLE_CNT (1)
+
+/*
+ * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
+ * be added. One extra extent for dabtree in case a local attr is
+ * large enough to cause a double split. It can also cause extent
+ * count to increase proportional to the size of a remote xattr's
+ * value.
+ */
+#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \
+ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks))
+
+/*
+ * A write to a sub-interval of an existing unwritten extent causes the original
+ * extent to be split into 3 extents
+ * i.e. | Unwritten | Real | Unwritten |
+ * Hence extent count can increase by 2.
+ */
+#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2)
+
+
+/*
+ * Moving an extent to data fork can cause a sub-interval of an existing extent
+ * to be unmapped. This will increase extent count by 1. Mapping in the new
+ * extent can increase the extent count by 1 again i.e.
+ * | Old extent | New extent | Old extent |
+ * Hence number of extents increases by 2.
+ */
+#define XFS_IEXT_REFLINK_END_COW_CNT (2)
+
+/*
+ * Removing an initial range of source/donor file's extent and adding a new
+ * extent (from donor/source file) in its place will cause extent count to
+ * increase by 1.
+ */
+#define XFS_IEXT_SWAP_RMAP_CNT (1)
+
+/*
+ * Fork handling.
+ */
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_EXTENTS ||
+ ifp->if_format == XFS_DINODE_FMT_BTREE;
+}
+
+static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return 0;
+ return ifp->if_nextents;
+}
+
+static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return XFS_DINODE_FMT_EXTENTS;
+ return ifp->if_format;
+}
+
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents);
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
+int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
+void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+ struct xfs_inode_log_item *, int);
+void xfs_idestroy_fork(struct xfs_ifork *ifp);
+void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+ int whichfork);
+void xfs_iroot_realloc(struct xfs_inode *, int, int);
+int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+ int);
+void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
+ const void *data, int64_t size);
+
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *, int);
+void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
+ int);
+void xfs_iext_destroy(struct xfs_ifork *);
+
+bool xfs_iext_lookup_extent(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t bno,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+bool xfs_iext_lookup_extent_before(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+void xfs_iext_update_extent(struct xfs_inode *ip, int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+
+void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *);
+
+static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_next(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_prev(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+/*
+ * Return the extent after cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_next(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+/*
+ * Return the extent before cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_prev(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+#define for_each_xfs_iext(ifp, ext, got) \
+ for (xfs_iext_first((ifp), (ext)); \
+ xfs_iext_get_extent((ifp), (ext), (got)); \
+ xfs_iext_next((ifp), (ext)))
+
+extern struct kmem_cache *xfs_ifork_cache;
+
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
+int xfs_ifork_verify_local_data(struct xfs_inode *ip);
+int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
+int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
+ int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
+
+/* returns true if the fork has extents but they are not read in yet. */
+static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+}
+
+#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644
index 000000000..f13e0809d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -0,0 +1,993 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and intepreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+typedef uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS 2
+#define XLOG_MAX_ICLOGS 8
+#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
+#define XLOG_VERSION_1 1
+#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
+
+#define XLOG_HEADER_SIZE 512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR 3
+
+#define XLOG_REC_SHIFT(log) \
+ BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+ return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+ if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+ return be32_to_cpu(*((__be32 *)ptr + 1));
+ else
+ return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION 0x69
+#define XFS_LOG 0xaa
+
+#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
+
+/*
+ * Log item for unmount records.
+ *
+ * The unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now; see xfs_log_unmount_write.
+ */
+struct xfs_unmount_log_format {
+ uint16_t magic; /* XLOG_UNMOUNT_TYPE */
+ uint16_t pad1;
+ uint32_t pad2; /* may as well make it 64 bits */
+};
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS. Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions. Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS 0x01 /* Start a new transaction */
+#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
+#define XLOG_END_TRANS 0x10 /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+ __be32 oh_tid; /* transaction id of operation : 4 b */
+ __be32 oh_len; /* bytes in data region : 4 b */
+ __u8 oh_clientid; /* who sent me this : 1 b */
+ __u8 oh_flags; /* : 1 b */
+ __u16 oh_res2; /* 32 bit align : 2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN 0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE 3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+ __be32 h_magicno; /* log record (LR) identifier : 4 */
+ __be32 h_cycle; /* write cycle of log : 4 */
+ __be32 h_version; /* LR version : 4 */
+ __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
+ __be64 h_lsn; /* lsn of this LR : 8 */
+ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
+ __le32 h_crc; /* crc of log record : 4 */
+ __be32 h_prev_block; /* block number to previous LR : 4 */
+ __be32 h_num_logops; /* number of log operations in this LR : 4 */
+ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+ /* new fields */
+ __be32 h_fmt; /* format of log record : 4 */
+ uuid_t h_fs_uuid; /* uuid of FS : 16 */
+ __be32 h_size; /* iclog size : 4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+ __be32 xh_cycle; /* write cycle of log : 4 */
+ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+ xlog_rec_header_t hic_header;
+ xlog_rec_ext_header_t hic_xheader;
+ char hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+ void *i_addr; /* beginning address of region */
+ int i_len; /* length in bytes of region */
+ uint i_type; /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+ uint th_magic; /* magic number */
+ uint th_type; /* transaction type */
+ int32_t th_tid; /* transaction id (unused) */
+ uint th_num_items; /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
+
+/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT 40
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_EFI 0x1236
+#define XFS_LI_EFD 0x1237
+#define XFS_LI_IUNLINK 0x1238
+#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT 0x123d
+#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
+#define XFS_LI_RUI 0x1240 /* rmap update intent */
+#define XFS_LI_RUD 0x1241
+#define XFS_LI_CUI 0x1242 /* refcount update intent */
+#define XFS_LI_CUD 0x1243
+#define XFS_LI_BUI 0x1244 /* bmbt update intent */
+#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
+
+#define XFS_LI_TYPE_DESC \
+ { XFS_LI_EFI, "XFS_LI_EFI" }, \
+ { XFS_LI_EFD, "XFS_LI_EFD" }, \
+ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
+ { XFS_LI_INODE, "XFS_LI_INODE" }, \
+ { XFS_LI_BUF, "XFS_LI_BUF" }, \
+ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
+ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
+ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
+ { XFS_LI_RUI, "XFS_LI_RUI" }, \
+ { XFS_LI_RUD, "XFS_LI_RUD" }, \
+ { XFS_LI_CUI, "XFS_LI_CUI" }, \
+ { XFS_LI_CUD, "XFS_LI_CUD" }, \
+ { XFS_LI_BUI, "XFS_LI_BUI" }, \
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log. The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field. Changes to this structure
+ * must be added on to the end.
+ */
+struct xfs_inode_log_format {
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint32_t ilf_pad; /* pad for 64 bit boundary */
+ uint64_t ilf_ino; /* inode number */
+ union {
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint8_t __pad[16]; /* unused */
+ } ilf_u;
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
+};
+
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
+ uint16_t ilf_type; /* inode log item type */
+ uint16_t ilf_size; /* size of this item */
+ uint32_t ilf_fields; /* flags for fields logged */
+ uint16_t ilf_asize; /* size of attr d/ext/root */
+ uint16_t ilf_dsize; /* size of data/ext/root */
+ uint64_t ilf_ino; /* inode number */
+ union {
+ uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uint8_t __pad[16]; /* unused */
+ } ilf_u;
+ int64_t ilf_blkno; /* blkno of inode buffer */
+ int32_t ilf_len; /* len of inode buffer */
+ int32_t ilf_boffset; /* off of inode in buffer */
+} __attribute__((packed));
+
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
+#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
+#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
+#define XFS_ILOG_DEV 0x010 /* log the dev field */
+#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */
+#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
+#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP 0x4000
+
+#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
+
+#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+ XFS_ILOG_DEV | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+typedef uint64_t xfs_log_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_log_legacy_timestamp {
+ int32_t t_sec; /* timestamp seconds */
+ int32_t t_nsec; /* timestamp nanoseconds */
+};
+
+/*
+ * Define the format of the inode core that is logged. This structure must be
+ * kept identical to struct xfs_dinode except for the endianness annotations.
+ */
+struct xfs_log_dinode {
+ uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ uint16_t di_mode; /* mode and type of file */
+ int8_t di_version; /* inode version */
+ int8_t di_format; /* format of di_c data */
+ uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint32_t di_uid; /* owner's user id */
+ uint32_t di_gid; /* owner's group id */
+ uint32_t di_nlink; /* number of links to file */
+ uint16_t di_projid_lo; /* lower part of owner's project id */
+ uint16_t di_projid_hi; /* higher part of owner's project id */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
+ xfs_log_timestamp_t di_atime; /* time last accessed */
+ xfs_log_timestamp_t di_mtime; /* time last modified */
+ xfs_log_timestamp_t di_ctime; /* time created/inode modified */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
+ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ int8_t di_aformat; /* format of attr fork's data */
+ uint32_t di_dmevmask; /* DMIG event mask */
+ uint16_t di_dmstate; /* DMIG state info */
+ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ uint32_t di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ uint32_t di_crc; /* CRC of the inode */
+ uint64_t di_changecount; /* number of attribute changes */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
+ uint64_t di_flags2; /* more random flags */
+ uint32_t di_cowextsize; /* basic cow extent size for file */
+ uint8_t di_pad2[12]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_log_timestamp_t di_crtime; /* time created */
+ xfs_ino_t di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+};
+
+#define xfs_log_dinode_size(mp) \
+ (xfs_has_v3inodes((mp)) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
+
+/*
+ * Buffer Log Format definitions
+ *
+ * These are the physical dirty bitmap definitions for the log format structure.
+ */
+#define XFS_BLF_CHUNK 128
+#define XFS_BLF_SHIFT 7
+#define BIT_TO_WORD_SHIFT 5
+#define NBWORD (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLF_INODE_BUF (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLF_CANCEL (1<<1)
+
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define XFS_BLF_UDQUOT_BUF (1<<2)
+#define XFS_BLF_PDQUOT_BUF (1<<3)
+#define XFS_BLF_GDQUOT_BUF (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the log. The data
+ * map describes which 128 byte chunks of the buffer have been logged.
+ *
+ * The placement of blf_map_size causes blf_data_map to start at an odd
+ * multiple of sizeof(unsigned int) offset within the struct. Because the data
+ * bitmap size will always be an even number, the end of the data_map (and
+ * therefore the structure) will also be at an odd multiple of sizeof(unsigned
+ * int). Some 64-bit compilers will insert padding at the end of the struct to
+ * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore,
+ * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and
+ * keep the structure size consistent between 32-bit and 64-bit platforms.
+ */
+#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
+
+typedef struct xfs_buf_log_format {
+ unsigned short blf_type; /* buf log item type indicator */
+ unsigned short blf_size; /* size of this item */
+ unsigned short blf_flags; /* misc state */
+ unsigned short blf_len; /* number of blocks in this buf */
+ int64_t blf_blkno; /* starting blkno of this buf */
+ unsigned int blf_map_size; /* used size of data bitmap in words */
+ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS 5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+ XFS_BLFT_UNKNOWN_BUF = 0,
+ XFS_BLFT_UDQUOT_BUF,
+ XFS_BLFT_PDQUOT_BUF,
+ XFS_BLFT_GDQUOT_BUF,
+ XFS_BLFT_BTREE_BUF,
+ XFS_BLFT_AGF_BUF,
+ XFS_BLFT_AGFL_BUF,
+ XFS_BLFT_AGI_BUF,
+ XFS_BLFT_DINO_BUF,
+ XFS_BLFT_SYMLINK_BUF,
+ XFS_BLFT_DIR_BLOCK_BUF,
+ XFS_BLFT_DIR_DATA_BUF,
+ XFS_BLFT_DIR_FREE_BUF,
+ XFS_BLFT_DIR_LEAF1_BUF,
+ XFS_BLFT_DIR_LEAFN_BUF,
+ XFS_BLFT_DA_NODE_BUF,
+ XFS_BLFT_ATTR_LEAF_BUF,
+ XFS_BLFT_ATTR_RMT_BUF,
+ XFS_BLFT_SB_BUF,
+ XFS_BLFT_RTBITMAP_BUF,
+ XFS_BLFT_RTSUMMARY_BUF,
+ XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+ ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+ blf->blf_flags &= ~XFS_BLFT_MASK;
+ blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+ return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+ xfs_fsblock_t ext_start;
+ xfs_extlen_t ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+ uint64_t ext_start;
+ uint32_t ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+ uint64_t ext_start;
+ uint32_t ext_len;
+ uint32_t ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log. The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_t efi_extents[]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+static inline size_t
+xfs_efi_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
+typedef struct xfs_efi_log_format_32 {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_32_t efi_extents[]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+static inline size_t
+xfs_efi_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
+typedef struct xfs_efi_log_format_64 {
+ uint16_t efi_type; /* efi log item type */
+ uint16_t efi_size; /* size of this item */
+ uint32_t efi_nextents; /* # extents to free */
+ uint64_t efi_id; /* efi identifier */
+ xfs_extent_64_t efi_extents[]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+static inline size_t
+xfs_efi_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log. The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_t efd_extents[]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+static inline size_t
+xfs_efd_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
+typedef struct xfs_efd_log_format_32 {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_32_t efd_extents[]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+static inline size_t
+xfs_efd_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
+typedef struct xfs_efd_log_format_64 {
+ uint16_t efd_type; /* efd log item type */
+ uint16_t efd_size; /* size of this item */
+ uint32_t efd_nextents; /* # of extents freed */
+ uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_64_t efd_extents[]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+static inline size_t
+xfs_efd_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
+/*
+ * RUI/RUD (reverse mapping) log format definitions
+ */
+struct xfs_map_extent {
+ uint64_t me_owner;
+ uint64_t me_startblock;
+ uint64_t me_startoff;
+ uint32_t me_len;
+ uint32_t me_flags;
+};
+
+/* rmap me_flags: upper bits are flags, lower byte is type code */
+#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_MAP_SHARED 2
+#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED 4
+#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED 6
+#define XFS_RMAP_EXTENT_ALLOC 7
+#define XFS_RMAP_EXTENT_FREE 8
+#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30)
+#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29)
+
+#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \
+ XFS_RMAP_EXTENT_ATTR_FORK | \
+ XFS_RMAP_EXTENT_BMBT_BLOCK | \
+ XFS_RMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an rui log item in the
+ * log. The rui_extents field is a variable size array whose
+ * size is given by rui_nextents.
+ */
+struct xfs_rui_log_format {
+ uint16_t rui_type; /* rui log item type */
+ uint16_t rui_size; /* size of this item */
+ uint32_t rui_nextents; /* # extents to free */
+ uint64_t rui_id; /* rui identifier */
+ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
+};
+
+static inline size_t
+xfs_rui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out an rud log item in the
+ * log. The rud_extents array is a variable size array whose
+ * size is given by rud_nextents;
+ */
+struct xfs_rud_log_format {
+ uint16_t rud_type; /* rud log item type */
+ uint16_t rud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t rud_rui_id; /* id of corresponding rui */
+};
+
+/*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+ uint64_t pe_startblock;
+ uint32_t pe_len;
+ uint32_t pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log. The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+ uint16_t cui_type; /* cui log item type */
+ uint16_t cui_size; /* size of this item */
+ uint32_t cui_nextents; /* # extents to free */
+ uint64_t cui_id; /* cui identifier */
+ struct xfs_phys_extent cui_extents[]; /* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_cui_log_format) +
+ nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log. The cud_extents array is a variable size array whose
+ * size is given by cud_nextents;
+ */
+struct xfs_cud_log_format {
+ uint16_t cud_type; /* cud log item type */
+ uint16_t cud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t cud_cui_id; /* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
+ XFS_BMAP_EXTENT_ATTR_FORK | \
+ XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an bui log item in the
+ * log. The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+ uint16_t bui_type; /* bui log item type */
+ uint16_t bui_size; /* size of this item */
+ uint32_t bui_nextents; /* # extents to free */
+ uint64_t bui_id; /* bui identifier */
+ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_bui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out an bud log item in the
+ * log. The bud_extents array is a variable size array whose
+ * size is given by bud_nextents;
+ */
+struct xfs_bud_log_format {
+ uint16_t bud_type; /* bud log item type */
+ uint16_t bud_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t bud_bui_id; /* id of corresponding bui */
+};
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+ uint16_t qlf_type; /* dquot log item type */
+ uint16_t qlf_size; /* size of this item */
+ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
+ int64_t qlf_blkno; /* blkno of dquot buffer */
+ int32_t qlf_len; /* len of dquot buffer */
+ uint32_t qlf_boffset; /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+ unsigned short qf_type; /* quotaoff log item type */
+ unsigned short qf_size; /* size of this item */
+ unsigned int qf_flags; /* USR and/or GRP */
+ char qf_pad[12]; /* padding for future */
+} xfs_qoff_logformat_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT \
+ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+ uint16_t icl_type; /* type of log format structure */
+ uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should
+ * never have any other bits set.
+ */
+#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_INCOMPLETE)
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_filter;/* attr filter flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644
index 000000000..2420865f3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to
+ * define how recovery should work for that type of log item.
+ */
+struct xlog_recover_item;
+
+/* Sorting hat for log items as they're read in. */
+enum xlog_recover_reorder {
+ XLOG_REORDER_BUFFER_LIST,
+ XLOG_REORDER_ITEM_LIST,
+ XLOG_REORDER_INODE_BUFFER_LIST,
+ XLOG_REORDER_CANCEL_LIST,
+};
+
+struct xlog_recover_item_ops {
+ uint16_t item_type; /* XFS_LI_* type code. */
+
+ /*
+ * Help sort recovered log items into the order required to replay them
+ * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do
+ * not have to supply a function here. See the comment preceding
+ * xlog_recover_reorder_trans for more details about what the return
+ * values mean.
+ */
+ enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item);
+
+ /* Start readahead for pass2, if provided. */
+ void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item);
+
+ /* Do whatever work we need to do for pass1, if provided. */
+ int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item);
+
+ /*
+ * This function should do whatever work is needed for pass2 of log
+ * recovery, if provided.
+ *
+ * If the recovered item is an intent item, this function should parse
+ * the recovered item to construct an in-core log intent item and
+ * insert it into the AIL. The in-core log intent item should have 1
+ * refcount so that the item is freed either (a) when we commit the
+ * recovered log item for the intent-done item; (b) replay the work and
+ * log a new intent-done item; or (c) recovery fails and we have to
+ * abort.
+ *
+ * If the recovered item is an intent-done item, this function should
+ * parse the recovered item to find the id of the corresponding intent
+ * log item. Next, it should find the in-core log intent item in the
+ * AIL and release it.
+ */
+ int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list,
+ struct xlog_recover_item *item, xfs_lsn_t lsn);
+};
+
+extern const struct xlog_recover_item_ops xlog_icreate_item_ops;
+extern const struct xlog_recover_item_ops xlog_buf_item_ops;
+extern const struct xlog_recover_item_ops xlog_inode_item_ops;
+extern const struct xlog_recover_item_ops xlog_dquot_item_ops;
+extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops;
+extern const struct xlog_recover_item_ops xlog_bui_item_ops;
+extern const struct xlog_recover_item_ops xlog_bud_item_ops;
+extern const struct xlog_recover_item_ops xlog_efi_item_ops;
+extern const struct xlog_recover_item_ops xlog_efd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rud_item_ops;
+extern const struct xlog_recover_item_ops xlog_cui_item_ops;
+extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS 4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid) \
+ ((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+
+#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0]. Additional buffers follow.
+ */
+struct xlog_recover_item {
+ struct list_head ri_list;
+ int ri_cnt; /* count of regions found */
+ int ri_total; /* total regions */
+ struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ const struct xlog_recover_item_ops *ri_ops;
+};
+
+struct xlog_recover {
+ struct hlist_node r_list;
+ xlog_tid_t r_log_tid; /* log's transaction id */
+ xfs_trans_header_t r_theader; /* trans header for partial */
+ int r_state; /* not needed */
+ xfs_lsn_t r_lsn; /* xact lsn */
+ struct list_head r_itemq; /* q for items */
+};
+
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
+
+#define XLOG_RECOVER_CRCPASS 0
+#define XLOG_RECOVER_PASS1 1
+#define XLOG_RECOVER_PASS2 2
+
+void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
+ const struct xfs_buf_ops *ops);
+bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
+
+int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_inode **ipp);
+void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
+ uint64_t intent_id);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
+
+#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000..9975b93a7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value as large attributes out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+ struct xfs_mount *mp)
+{
+ int size;
+ int nblks;
+
+ size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+ MAXNAMELEN - 1;
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ nblks += XFS_B_TO_FSB(mp, size);
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+ return M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
+ * Iterate over the log space reservation table to figure out and return
+ * the maximum one in terms of the pre-calculated values which were done
+ * at mount time.
+ */
+void
+xfs_log_get_max_trans_res(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *max_resp)
+{
+ struct xfs_trans_resv resv = {};
+ struct xfs_trans_res *resp;
+ struct xfs_trans_res *end_resp;
+ unsigned int i;
+ int log_space = 0;
+ int attr_space;
+
+ attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
+ int tmp = resp->tr_logcount > 1 ?
+ resp->tr_logres * resp->tr_logcount :
+ resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
+ if (log_space < tmp) {
+ log_space = tmp;
+ *max_resp = *resp; /* struct copy */
+ }
+ }
+
+ if (attr_space > log_space) {
+ *max_resp = resv.tr_attrsetm; /* struct copy */
+ max_resp->tr_logres = attr_space;
+ }
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans_res tres = {0};
+ int max_logres;
+ int min_logblks = 0;
+ int lsunit = 0;
+
+ xfs_log_get_max_trans_res(mp, &tres);
+
+ max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+ if (tres.tr_logcount > 1)
+ max_logres *= tres.tr_logcount;
+
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
+ lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+ /*
+ * Two factors should be taken into account for calculating the minimum
+ * log space.
+ * 1) The fundamental limitation is that no single transaction can be
+ * larger than half size of the log.
+ *
+ * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+ * define, which is set to 3. That means we can definitely fit
+ * maximally sized 2 transactions in the log. We'll use this same
+ * value here.
+ *
+ * 2) If the lsunit option is specified, a transaction requires 2 LSU
+ * for the reservation because there are two log writes that can
+ * require padding - the transaction data and the commit record which
+ * are written separately and both can require padding to the LSU.
+ * Consider that we can have an active CIL reservation holding 2*LSU,
+ * but the CIL is not over a push threshold, in this case, if we
+ * don't have enough log space for at one new transaction, which
+ * includes another 2*LSU in the reservation, we will run into dead
+ * loop situation in log space grant procedure. i.e.
+ * xlog_grant_head_wait().
+ *
+ * Hence the log size needs to be able to contain two maximally sized
+ * and padded transactions, which is (2 * (2 * LSU + maxlres)).
+ *
+ * Also, the log size should be a multiple of the log stripe unit, round
+ * it up to lsunit boundary if lsunit is specified.
+ */
+ if (lsunit) {
+ min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+ 2 * lsunit;
+ } else
+ min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+ min_logblks *= XFS_MIN_LOG_FACTOR;
+
+ return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644
index 000000000..cb035da3f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef uint64_t xfs_qcnt_t;
+
+typedef uint8_t xfs_dqtype_t;
+
+#define XFS_DQTYPE_STRINGS \
+ { XFS_DQTYPE_USER, "USER" }, \
+ { XFS_DQTYPE_PROJ, "PROJ" }, \
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
+
+#define XFS_DQFLAG_STRINGS \
+ { XFS_DQFLAG_DIRTY, "DIRTY" }, \
+ { XFS_DQFLAG_FREEING, "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp) \
+ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL \
+ (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+
+extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id);
+extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
+ struct xfs_dqblk *dqb, xfs_dqid_t id);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
+ xfs_dqid_t id, xfs_dqtype_t type);
+
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+
+#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000..3f34bafe1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1915 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_refcount_intent_cache;
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+ XFS_REFCOUNT_ADJUST_INCREASE = 1,
+ XFS_REFCOUNT_ADJUST_DECREASE = -1,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC = 0,
+ XFS_REFCOUNT_ADJUST_COW_FREE = -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen);
+
+/*
+ * Look up the first record less than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_ge(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_GE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Look up the first record equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_eq(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/* Convert on-disk record to in-core format. */
+void
+xfs_refcount_btrec_to_irec(
+ const union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = be32_to_cpu(rec->refc.rc_startblock);
+ if (start & XFS_REFC_COWFLAG) {
+ start &= ~XFS_REFC_COWFLAG;
+ irec->rc_domain = XFS_REFC_DOMAIN_COW;
+ } else {
+ irec->rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ irec->rc_startblock = start;
+ irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+ irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ xfs_refcount_btrec_to_irec(rec, irec);
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+ goto out_bad_rec;
+
+ if (!xfs_refcount_check_domain(irec))
+ goto out_bad_rec;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
+ goto out_bad_rec;
+
+ if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+ goto out_bad_rec;
+
+ trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
+ return 0;
+
+out_bad_rec:
+ xfs_warn(mp,
+ "Refcount BTree record corruption in AG %d detected!",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "Start block 0x%x, block count 0x%x, references 0x%x",
+ irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec)
+{
+ union xfs_btree_rec rec;
+ uint32_t start;
+ int error;
+
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec.refc.rc_startblock = cpu_to_be32(start);
+ rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+ rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_refcount_update_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Insert the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int
+xfs_refcount_insert(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *i)
+{
+ int error;
+
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
+ cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+ cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+ cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ cur->bc_rec.rc.rc_domain = irec->rc_domain;
+
+ error = xfs_btree_insert(cur, i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+out_error:
+ if (error)
+ trace_xfs_refcount_insert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_delete(
+ struct xfs_btree_cur *cur,
+ int *i)
+{
+ struct xfs_refcount_irec irec;
+ int found_rec;
+ int error;
+
+ error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
+ error = xfs_btree_delete(cur, i);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (error)
+ goto out_error;
+ error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock,
+ &found_rec);
+out_error:
+ if (error)
+ trace_xfs_refcount_delete_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks. In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+---------
+ * 2 | | 3 | 4 | |17| 55 | 10
+ * ----+ +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted. For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+----+----
+ * 2 | | 3 | 2 | |17| 55 | 10 | 10
+ * ----+ +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once. Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ * <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent. Now we
+ * have a left, current, and right extent. If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so. If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those. In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 3 | 2 |1|17| 55 | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2. If we were
+ * incrementing, the end result looks like this:
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 4 | 3 |2|18| 56 | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+ +--+--------+----+----
+ * 2 | | 2 | |16| 54 | 9 | 10
+ * ----+ +---+ +--+--------+----+----
+ * DDDD 111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec rcext, tmp;
+ int found_rec;
+ int error;
+
+ *shape_changed = false;
+ error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (rcext.rc_domain != domain)
+ return 0;
+ if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+ return 0;
+
+ *shape_changed = true;
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ &rcext, agbno);
+
+ /* Establish the right extent. */
+ tmp = rcext;
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+ error = xfs_refcount_update(cur, &tmp);
+ if (error)
+ goto out_error;
+
+ /* Insert the left extent. */
+ tmp = rcext;
+ tmp.rc_blockcount = agbno - rcext.rc_startblock;
+ error = xfs_refcount_insert(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ return error;
+
+out_error:
+ trace_xfs_refcount_split_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *center,
+ struct xfs_refcount_irec *right,
+ unsigned long long extlen,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, left, center, right);
+
+ ASSERT(left->rc_domain == center->rc_domain);
+ ASSERT(right->rc_domain == center->rc_domain);
+
+ /*
+ * Make sure the center and right extents are not in the btree.
+ * If the center extent was synthesized, the first delete call
+ * removes the right extent and we skip the second deletion.
+ * If center and right were in the btree, then the first delete
+ * call removes the center and the second one removes the right
+ * extent.
+ */
+ error = xfs_refcount_lookup_ge(cur, center->rc_domain,
+ center->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (center->rc_refcount > 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ left->rc_blockcount = extlen;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *aglen = 0;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the left extent.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, left, cleft);
+
+ ASSERT(left->rc_domain == cleft->rc_domain);
+
+ /* If the extent at agbno (cleft) wasn't synthesized, remove it. */
+ if (cleft->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cleft->rc_domain,
+ cleft->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ left->rc_blockcount += cleft->rc_blockcount;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *agbno += cleft->rc_blockcount;
+ *aglen -= cleft->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the right extent.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, cright, right);
+
+ ASSERT(right->rc_domain == cright->rc_domain);
+
+ /*
+ * If the extent ending at agbno+aglen (cright) wasn't synthesized,
+ * remove it.
+ */
+ if (cright->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cright->rc_domain,
+ cright->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* Enlarge the right extent. */
+ error = xfs_refcount_lookup_le(cur, right->rc_domain,
+ right->rc_startblock, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ right->rc_startblock -= cright->rc_blockcount;
+ right->rc_blockcount += cright->rc_blockcount;
+ error = xfs_refcount_update(cur, right);
+ if (error)
+ goto out_error;
+
+ *aglen -= cright->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the left extent and the one after it (cleft). This function assumes
+ * that we've already split any extent crossing agbno.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ return 0;
+ if (xfs_refc_next(&tmp) != agbno)
+ return 0;
+ /* We have a left extent; retrieve (or invent) the next right one */
+ *left = tmp;
+
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
+ /* if tmp starts at the end of our range, just use that */
+ if (tmp.rc_startblock == agbno)
+ *cleft = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the start of the
+ * range we're interested in (refcount == 1) so
+ * synthesize the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = min(aglen,
+ tmp.rc_startblock - agbno);
+ cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
+ }
+ } else {
+not_found:
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = aglen;
+ cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
+ }
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ left, cleft, agbno);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the right extent and the one before it (cright). This function
+ * assumes that we've already split any extents crossing agbno + aglen.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ return 0;
+ if (tmp.rc_startblock != agbno + aglen)
+ return 0;
+ /* We have a right extent; retrieve (or invent) the next left one */
+ *right = tmp;
+
+ error = xfs_btree_decrement(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
+ /* if tmp ends at the end of our range, just use that */
+ if (xfs_refc_next(&tmp) == agbno + aglen)
+ *cright = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the end of the
+ * range we're interested in (refcount == 1) so
+ * create the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+ cright->rc_blockcount = right->rc_startblock -
+ cright->rc_startblock;
+ cright->rc_refcount = 1;
+ cright->rc_domain = domain;
+ }
+ } else {
+not_found:
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cright->rc_startblock = agbno;
+ cright->rc_blockcount = aglen;
+ cright->rc_refcount = 1;
+ cright->rc_domain = domain;
+ }
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ cright, right, agbno + aglen);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Is this extent valid? */
+static inline bool
+xfs_refc_valid(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock != NULLAGBLOCK;
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adjust,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec left = {0}, cleft = {0};
+ struct xfs_refcount_irec cright = {0}, right = {0};
+ int error;
+ unsigned long long ulen;
+ bool cequal;
+
+ *shape_changed = false;
+ /*
+ * Find the extent just below agbno [left], just above agbno [cleft],
+ * just below (agbno + aglen) [cright], and just above (agbno + aglen)
+ * [right].
+ */
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain,
+ *agbno, *aglen);
+ if (error)
+ return error;
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, domain,
+ *agbno, *aglen);
+ if (error)
+ return error;
+
+ /* No left or right extent to merge; exit. */
+ if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+ return 0;
+
+ cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+ (cleft.rc_blockcount == cright.rc_blockcount);
+
+ /* Try to merge left, cleft, and right. cleft must == cright. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+ right.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+ xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ right.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+ &right, ulen, aglen);
+ }
+
+ /* Try to merge left and cleft. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+ agbno, aglen);
+ if (error)
+ return error;
+
+ /*
+ * If we just merged left + cleft and cleft == cright,
+ * we no longer have a cright to merge with right. We're done.
+ */
+ if (cequal)
+ return 0;
+ }
+
+ /* Try to merge cright and right. */
+ ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+ if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+ right.rc_refcount == cright.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_right_extent(cur, &right, &cright,
+ aglen);
+ }
+
+ return 0;
+}
+
+/*
+ * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+ * Be conservative here.
+ */
+static bool
+xfs_refcount_still_have_space(
+ struct xfs_btree_cur *cur)
+{
+ unsigned long overhead;
+
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
+ overhead *= cur->bc_mp->m_sb.sb_blocksize;
+
+ /*
+ * Only allow 2 refcount extent updates per transaction if the
+ * refcount continue update "error" has been injected.
+ */
+ if (cur->bc_ag.refc.nr_ops > 2 &&
+ XFS_TEST_ERROR(false, cur->bc_mp,
+ XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
+ return false;
+
+ if (cur->bc_ag.refc.nr_ops == 0)
+ return true;
+ else if (overhead > cur->bc_tp->t_log_res)
+ return false;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+}
+
+/*
+ * Adjust the refcounts of middle extents. At this point we should have
+ * split extents that crossed the adjustment range; merged with adjacent
+ * extents; and updated agbno/aglen to reflect the merges. Therefore,
+ * all we have to do is update the extents inside [agbno, agbno + aglen].
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+ xfs_fsblock_t fsbno;
+
+ /* Merging did all the work already. */
+ if (*aglen == 0)
+ return 0;
+
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+
+ while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ /*
+ * Deal with a hole in the refcount tree; if a file maps to
+ * these blocks and there's no refcountbt record, pretend that
+ * there is one with refcount == 1.
+ */
+ if (ext.rc_startblock != *agbno) {
+ tmp.rc_startblock = *agbno;
+ tmp.rc_blockcount = min(*aglen,
+ ext.rc_startblock - *agbno);
+ tmp.rc_refcount = 1 + adj;
+ tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &tmp);
+
+ /*
+ * Either cover the hole (increment) or
+ * delete the range (decrement).
+ */
+ cur->bc_ag.refc.nr_ops++;
+ if (tmp.rc_refcount) {
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno,
+ tmp.rc_startblock);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
+ }
+
+ (*agbno) += tmp.rc_blockcount;
+ (*aglen) -= tmp.rc_blockcount;
+
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /* Move the cursor to the start of ext. */
+ error = xfs_refcount_lookup_ge(cur,
+ XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ }
+
+ /*
+ * A previous step trimmed agbno/aglen such that the end of the
+ * range would not be in the middle of the record. If this is
+ * no longer the case, something is seriously wrong with the
+ * btree. Make sure we never feed the synthesized record into
+ * the processing loop below.
+ */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
+ XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /*
+ * Adjust the reference count and either update the tree
+ * (incr) or free the blocks (decr).
+ */
+ if (ext.rc_refcount == MAXREFCOUNT)
+ goto skip;
+ ext.rc_refcount += adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
+ if (ext.rc_refcount > 1) {
+ error = xfs_refcount_update(cur, &ext);
+ if (error)
+ goto out_error;
+ } else if (ext.rc_refcount == 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ goto advloop;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno,
+ ext.rc_startblock);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
+ }
+
+skip:
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+
+advloop:
+ (*agbno) += ext.rc_blockcount;
+ (*aglen) -= ext.rc_blockcount;
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Adjust the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *new_agbno,
+ xfs_extlen_t *new_aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ bool shape_changed;
+ int shape_changes = 0;
+ int error;
+
+ *new_agbno = agbno;
+ *new_aglen = aglen;
+ if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+ else
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
+ new_agbno, new_aglen, adj, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+ if (shape_changes)
+ cur->bc_ag.refc.shape_changes++;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Clean up after calling xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Set up a continuation a deferred refcount operation by updating the intent.
+ * Checks to make sure we're not going to run off the end of the AG.
+ */
+static inline int
+xfs_refcount_continue_op(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t startblock,
+ xfs_agblock_t new_agbno,
+ xfs_extlen_t new_len,
+ xfs_fsblock_t *new_fsbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ return -EFSCORRUPTED;
+
+ *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+
+ ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+
+ return 0;
+}
+
+/*
+ * Process one of the deferred refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_refcount_finish_one(
+ struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ xfs_agblock_t bno;
+ xfs_agblock_t new_agbno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ nr_ops = rcur->bc_ag.refc.nr_ops;
+ shape_changes = rcur->bc_ag.refc.shape_changes;
+ xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+ &agbp);
+ if (error)
+ goto out_drop;
+
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ rcur->bc_ag.refc.nr_ops = nr_ops;
+ rcur->bc_ag.refc.shape_changes = shape_changes;
+ }
+ *pcur = rcur;
+
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_alloc(rcur, bno, blockcount);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_free(rcur, bno, blockcount);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (!error && *new_len > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
+ bno, blockcount, new_agbno, *new_len);
+out_drop:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Record a refcount intent for later processing.
+ */
+static void
+__xfs_refcount_add(
+ struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount)
+{
+ struct xfs_refcount_intent *ri;
+
+ trace_xfs_refcount_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock),
+ type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ blockcount);
+
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_startblock = startblock;
+ ri->ri_blockcount = blockcount;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+}
+
+/*
+ * Increase the reference count of the blocks backing a file's extent.
+ */
+void
+xfs_refcount_increase_extent(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_has_reflink(tp->t_mountp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ PREV->br_blockcount);
+}
+
+/*
+ * Decrease the reference count of the blocks backing a file's extent.
+ */
+void
+xfs_refcount_decrease_extent(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_has_reflink(tp->t_mountp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ PREV->br_blockcount);
+}
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is set, return the longest contiguous extent of
+ * shared blocks; if not, just return the first extent we find. If no
+ * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
+ * and 0, respectively.
+ */
+int
+xfs_refcount_find_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_refcount_irec tmp;
+ int i;
+ int have;
+ int error;
+
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* By default, skip the whole range */
+ *fbno = NULLAGBLOCK;
+ *flen = 0;
+
+ /* Try to find a refcount extent that crosses the start */
+ error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno,
+ &have);
+ if (error)
+ goto out_error;
+ if (!have) {
+ /* No left extent, look at the next one */
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ }
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
+
+ /* If the extent ends before the start, look at the next one */
+ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
+ }
+
+ /* If the extent starts after the range we want, bail out */
+ if (tmp.rc_startblock >= agbno + aglen)
+ goto done;
+
+ /* We found the start of a shared extent! */
+ if (tmp.rc_startblock < agbno) {
+ tmp.rc_blockcount -= (agbno - tmp.rc_startblock);
+ tmp.rc_startblock = agbno;
+ }
+
+ *fbno = tmp.rc_startblock;
+ *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno);
+ if (!find_end_of_shared)
+ goto done;
+
+ /* Otherwise, find the end of this shared extent */
+ while (*fbno + *flen < agbno + aglen) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ break;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED ||
+ tmp.rc_startblock >= agbno + aglen ||
+ tmp.rc_startblock != *fbno + *flen)
+ break;
+ *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
+ }
+
+done:
+ trace_xfs_refcount_find_shared_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *fbno, *flen);
+
+out_error:
+ if (error)
+ trace_xfs_refcount_find_shared_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork. The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file. A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost. This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree. The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1. Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records. These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1. The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings. These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log. Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of (1) but extent-size reservations
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail. Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time. Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
+/*
+ * Adjust the refcounts of CoW allocations. These allocations are "magic"
+ * in that they're not referenced anywhere else in the filesystem, so we
+ * stash them in the refcount btree with a refcount of 1 until either file
+ * remapping (or CoW cancellation) happens.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+
+ if (aglen == 0)
+ return 0;
+
+ /* Find any overlapping refcount records */
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
+ ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_COW;
+ }
+
+ switch (adj) {
+ case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+ /* Adding a CoW reservation, there should be nothing here. */
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ agbno + aglen > ext.rc_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount = aglen;
+ tmp.rc_refcount = 1;
+ tmp.rc_domain = XFS_REFC_DOMAIN_COW;
+
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &tmp);
+
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ break;
+ case XFS_REFCOUNT_ADJUST_COW_FREE:
+ /* Removing a CoW reservation, there should be one extent. */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ ext.rc_refcount = 0;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, &ext);
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj)
+{
+ bool shape_changed;
+ int error;
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno, &shape_changed);
+ if (error)
+ goto out_error;
+
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno,
+ &aglen, adj, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* Add refcount btree reservation */
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC);
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen)
+{
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
+ agbno, aglen);
+
+ /* Remove refcount btree reservation */
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_FREE);
+}
+
+/* Record a CoW staging extent in the refcount btree. */
+void
+xfs_refcount_alloc_cow_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!xfs_has_reflink(mp))
+ return;
+
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+
+ /* Add rmap entry */
+ xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+}
+
+/* Forget a CoW staging event in the refcount btree. */
+void
+xfs_refcount_free_cow_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!xfs_has_reflink(mp))
+ return;
+
+ /* Remove rmap entry */
+ xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+}
+
+struct xfs_refcount_recovery {
+ struct list_head rr_list;
+ struct xfs_refcount_irec rr_rrec;
+};
+
+/* Stuff an extent on the recovery list. */
+STATIC int
+xfs_refcount_recover_extent(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct list_head *debris = priv;
+ struct xfs_refcount_recovery *rr;
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ be32_to_cpu(rec->refc.rc_refcount) != 1))
+ return -EFSCORRUPTED;
+
+ rr = kmalloc(sizeof(struct xfs_refcount_recovery),
+ GFP_KERNEL | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&rr->rr_list);
+ xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
+
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ kfree(rr);
+ return -EFSCORRUPTED;
+ }
+
+ list_add_tail(&rr->rr_list, debris);
+ return 0;
+}
+
+/* Find and remove leftover CoW reservations. */
+int
+xfs_refcount_recover_cow_leftovers(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_refcount_recovery *rr, *n;
+ struct list_head debris;
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+ xfs_fsblock_t fsb;
+ int error;
+
+ /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
+ if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
+ return -EOPNOTSUPP;
+
+ INIT_LIST_HEAD(&debris);
+
+ /*
+ * In this first part, we use an empty transaction to gather up
+ * all the leftover CoW extents so that we can subsequently
+ * delete them. The empty transaction is used to avoid
+ * a buffer lock deadlock if there happens to be a loop in the
+ * refcountbt because we're allowed to re-grab a buffer that is
+ * already attached to our transaction. When we're done
+ * recording the CoW debris we cancel the (empty) transaction
+ * and everything goes away cleanly.
+ */
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+
+ /* Find all the leftover CoW staging extents. */
+ memset(&low, 0, sizeof(low));
+ memset(&high, 0, sizeof(high));
+ low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
+ high.rc.rc_startblock = -1U;
+ error = xfs_btree_query_range(cur, &low, &high,
+ xfs_refcount_recover_extent, &debris);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
+ xfs_trans_cancel(tp);
+ if (error)
+ goto out_free;
+
+ /* Now iterate the list to free the leftovers */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ /* Set up transaction. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
+ &rr->rr_rrec);
+
+ /* Free the orphan record */
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+ rr->rr_rrec.rc_startblock);
+ xfs_refcount_free_cow_extent(tp, fsb,
+ rr->rr_rrec.rc_blockcount);
+
+ /* Free the block. */
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+
+ list_del(&rr->rr_list);
+ kfree(rr);
+ }
+
+ return error;
+out_trans:
+ xfs_trans_cancel(tp);
+out_free:
+ /* Free the leftover list */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ list_del(&rr->rr_list);
+ kfree(rr);
+ }
+ return error;
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_refcount_has_record(
+ struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.rc.rc_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rc.rc_startblock = bno + len - 1;
+ low.rc.rc_domain = high.rc.rc_domain = domain;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 000000000..452f30556
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_btree_cur;
+struct xfs_bmbt_irec;
+struct xfs_refcount_irec;
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+static inline uint32_t
+xfs_refcount_encode_startblock(
+ xfs_agblock_t startblock,
+ enum xfs_refc_domain domain)
+{
+ uint32_t start;
+
+ /*
+ * low level btree operations need to handle the generic btree range
+ * query functions (which set rc_domain == -1U), so we check that the
+ * domain is /not/ shared.
+ */
+ start = startblock & ~XFS_REFC_COWFLAG;
+ if (domain != XFS_REFC_DOMAIN_SHARED)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+enum xfs_refcount_intent_type {
+ XFS_REFCOUNT_INCREASE = 1,
+ XFS_REFCOUNT_DECREASE,
+ XFS_REFCOUNT_ALLOC_COW,
+ XFS_REFCOUNT_FREE_COW,
+};
+
+struct xfs_refcount_intent {
+ struct list_head ri_list;
+ enum xfs_refcount_intent_type ri_type;
+ xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
+};
+
+/* Check that the refcount is appropriate for the record domain. */
+static inline bool
+xfs_refcount_check_domain(
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
+ return false;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
+ return false;
+ return true;
+}
+
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
+ struct xfs_bmbt_irec *irec);
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+ struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+ enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+ xfs_extlen_t *flen, bool find_end_of_shared);
+
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+ struct xfs_perag *pag);
+
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD 32
+
+extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
+union xfs_btree_rec;
+extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec);
+extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
+#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 000000000..e1f789866
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_refcountbt_cur_cache;
+
+static struct xfs_btree_cur *
+xfs_refcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag);
+}
+
+STATIC void
+xfs_refcountbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = agbp->b_pag;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_refcount_root = ptr->s;
+ be32_add_cpu(&agf->agf_refcount_level, inc);
+ pag->pagf_refcount_level += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp,
+ XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+}
+
+STATIC int
+xfs_refcountbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_alloc_arg args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refc_block(args.mp));
+ args.oinfo = XFS_RMAP_OINFO_REFC;
+ args.minlen = args.maxlen = args.prod = 1;
+ args.resv = XFS_AG_RESV_METADATA;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_error;
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ args.agbno, 1);
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(args.agbno);
+ be32_add_cpu(&agf->agf_refcount_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+ *stat = 1;
+ return 0;
+
+out_error:
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ int error;
+
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
+ be32_add_cpu(&agf->agf_refcount_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
+ XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mnr[level != 0];
+}
+
+STATIC int
+xfs_refcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mxr[level != 0];
+}
+
+STATIC void
+xfs_refcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_refcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_refcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_refcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_refcount_root;
+}
+
+STATIC int64_t
+xfs_refcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_refcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC xfs_failaddr_t
+xfs_refcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_refcount_level)
+ return __this_address;
+ } else if (level >= mp->m_refc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void
+xfs_refcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_refcountbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_refcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_refcountbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+ .name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
+ .verify_read = xfs_refcountbt_read_verify,
+ .verify_write = xfs_refcountbt_write_verify,
+ .verify_struct = xfs_refcountbt_verify,
+};
+
+STATIC int
+xfs_refcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_refcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+
+ .dup_cursor = xfs_refcountbt_dup_cursor,
+ .set_root = xfs_refcountbt_set_root,
+ .alloc_block = xfs_refcountbt_alloc_block,
+ .free_block = xfs_refcountbt_free_block,
+ .get_minrecs = xfs_refcountbt_get_minrecs,
+ .get_maxrecs = xfs_refcountbt_get_maxrecs,
+ .init_key_from_rec = xfs_refcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
+ .key_diff = xfs_refcountbt_key_diff,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .diff_two_keys = xfs_refcountbt_diff_two_keys,
+ .keys_inorder = xfs_refcountbt_keys_inorder,
+ .recs_inorder = xfs_refcountbt_recs_inorder,
+};
+
+/*
+ * Initialize a new refcount btree cursor.
+ */
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
+
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ cur->bc_ag.refc.nr_ops = 0;
+ cur->bc_ag.refc.shape_changes = 0;
+ cur->bc_ops = &xfs_refcountbt_ops;
+ return cur;
+}
+
+/* Create a btree cursor. */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, tp, pag);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Swap in the new btree root. Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+ agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+ agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT |
+ XFS_AGF_REFCOUNT_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+}
+
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
+/*
+ * Calculate the number of records in a refcount btree block.
+ */
+int
+xfs_refcountbt_maxrecs(
+ int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+}
+
+/* Compute the maximum height of a refcount btree. */
+void
+xfs_refcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
+ mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
+ mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_refc_mxr[0] == 0)
+ return 0;
+
+ return xfs_refcountbt_calc_size(mp, agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_agblock_t agblocks;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = agbp->b_addr;
+ agblocks = be32_to_cpu(agf->agf_length);
+ tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+ xfs_trans_brelse(tp, agbp);
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ *ask += xfs_refcountbt_max_size(mp, agblocks);
+ *used += tree_len;
+
+ return error;
+}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 000000000..d66b37259
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define __XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
+
+/*
+ * Btree block header size
+ */
+#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+ ((struct xfs_refcount_rec *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+ ((struct xfs_refcount_key *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_refcount_ptr_t *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct xfs_refcount_key) + \
+ ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
+extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask,
+ xfs_extlen_t *used);
+
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
+#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
new file mode 100644
index 000000000..b56aca1e7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -0,0 +1,2826 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_inode.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_rmap_intent_cache;
+
+/*
+ * Lookup the first record less than or equal to [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ int get_stat = 0;
+ int error;
+
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = 0;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Lookup the record exactly matching [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, owner, offset].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_rmap_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ irec->rm_startblock, irec->rm_blockcount,
+ irec->rm_owner, irec->rm_offset, irec->rm_flags);
+
+ rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+ rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+ rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+ rec.rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(irec));
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_rmap_update_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+int
+xfs_rmap_insert(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ rcur->bc_rec.r.rm_startblock = agbno;
+ rcur->bc_rec.r.rm_blockcount = len;
+ rcur->bc_rec.r.rm_owner = owner;
+ rcur->bc_rec.r.rm_offset = offset;
+ rcur->bc_rec.r.rm_flags = flags;
+ error = xfs_btree_insert(rcur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+done:
+ if (error)
+ trace_xfs_rmap_insert_error(rcur->bc_mp,
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+STATIC int
+xfs_rmap_delete(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ error = xfs_btree_delete(rcur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+done:
+ if (error)
+ trace_xfs_rmap_delete_error(rcur->bc_mp,
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Convert an internal btree record to an rmap record. */
+int
+xfs_rmap_btrec_to_irec(
+ const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
+{
+ irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
+ irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
+ irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
+ return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset),
+ irec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ if (xfs_rmap_btrec_to_irec(rec, irec))
+ goto out_bad_rec;
+
+ if (irec->rm_blockcount == 0)
+ goto out_bad_rec;
+ if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) {
+ if (irec->rm_owner != XFS_RMAP_OWN_FS)
+ goto out_bad_rec;
+ if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ goto out_bad_rec;
+ } else {
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
+ goto out_bad_rec;
+ }
+
+ if (!(xfs_verify_ino(mp, irec->rm_owner) ||
+ (irec->rm_owner <= XFS_RMAP_OWN_FS &&
+ irec->rm_owner >= XFS_RMAP_OWN_MIN)))
+ goto out_bad_rec;
+
+ return 0;
+out_bad_rec:
+ xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected!",
+ pag->pag_agno);
+ xfs_warn(mp,
+ "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
+ irec->rm_owner, irec->rm_flags, irec->rm_startblock,
+ irec->rm_blockcount);
+ return -EFSCORRUPTED;
+}
+
+struct xfs_find_left_neighbor_info {
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_irec *irec;
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+ return 0;
+
+ *info->irec = *rec;
+ return -ECANCELED;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.
+ */
+STATIC int
+xfs_rmap_find_left_neighbor(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int found = 0;
+ int error;
+
+ *stat = 0;
+ if (bno == 0)
+ return 0;
+ info.high.rm_startblock = bno - 1;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+ !(flags & XFS_RMAP_BMBT_BLOCK)) {
+ if (offset == 0)
+ return 0;
+ info.high.rm_offset = offset - 1;
+ } else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ info.irec = irec;
+
+ trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
+}
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ (rec->rm_offset > info->high.rm_offset ||
+ rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+ return 0;
+
+ *info->irec = *rec;
+ return -ECANCELED;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges. This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int found = 0;
+ int error;
+
+ info.high.rm_startblock = bno;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+ info.high.rm_offset = offset;
+ else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ *stat = 0;
+ info.irec = irec;
+
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
+}
+
+/*
+ * Perform all the relevant owner checks for a removal op. If we're doing an
+ * unknown-owner removal then we have no owner information to check.
+ */
+static int
+xfs_rmap_free_check_owner(
+ struct xfs_mount *mp,
+ uint64_t ltoff,
+ struct xfs_rmap_irec *rec,
+ xfs_filblks_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int error = 0;
+
+ if (owner == XFS_RMAP_OWN_UNKNOWN)
+ return 0;
+
+ /* Make sure the unwritten flag matches. */
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Check the offset, if necessary. */
+ if (XFS_RMAP_NON_INODE_OWNER(owner))
+ goto out;
+
+ if (flags & XFS_RMAP_BMBT_BLOCK) {
+ if (XFS_IS_CORRUPT(mp,
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ } else {
+ if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ offset + len > ltoff + rec->rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Find the extent in the rmap btree and remove it.
+ *
+ * The record we find should always be an exact match for the extent that we're
+ * looking for, since we insert them into the btree without modification.
+ *
+ * Special Case #1: when growing the filesystem, we "free" an extent when
+ * growing the last AG. This extent is new space and so it is not tracked as
+ * used space in the btree. The growfs code will pass in an owner of
+ * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this
+ * extent. We verify that - the extent lookup result in a record that does not
+ * overlap.
+ *
+ * Special Case #2: EFIs do not record the owner of the extent, so when
+ * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap
+ * btree to ignore the owner (i.e. wildcard match) so we don't trigger
+ * corruption checks during log recovery.
+ */
+STATIC int
+xfs_rmap_unmap(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ ltoff = ltrec.rm_offset;
+
+ /*
+ * For growfs, the incoming extent must be beyond the left record we
+ * just found as it is new space and won't be used by anyone. This is
+ * just a corruption check as we don't actually do anything with this
+ * extent. Note that we need to use >= instead of > because it might
+ * be the case that the "left" extent goes all the way to EOFS.
+ */
+ if (owner == XFS_RMAP_OWN_NULL) {
+ if (XFS_IS_CORRUPT(mp,
+ bno <
+ ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ goto out_done;
+ }
+
+ /*
+ * If we're doing an unknown-owner removal for EFI recovery, we expect
+ * to find the full range in the rmapbt or nothing at all. If we
+ * don't find any rmaps overlapping either end of the range, we're
+ * done. Hopefully this means that the EFI creator already queued
+ * (and finished) a RUI to remove the rmap.
+ */
+ if (owner == XFS_RMAP_OWN_UNKNOWN &&
+ ltrec.rm_startblock + ltrec.rm_blockcount <= bno) {
+ struct xfs_rmap_irec rtrec;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+ if (i == 0)
+ goto out_done;
+ error = xfs_rmap_get_rec(cur, &rtrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (rtrec.rm_startblock >= bno + len)
+ goto out_done;
+ }
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Check owner information. */
+ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+ offset, flags);
+ if (error)
+ goto out_error;
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* exact match, simply remove the record from rmap tree */
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ ltrec.rm_startblock, ltrec.rm_blockcount,
+ ltrec.rm_owner, ltrec.rm_offset,
+ ltrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ if (!ignore_off)
+ ltrec.rm_offset += len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * overlap right hand side of extent: trim the length and update
+ * the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+
+ /*
+ * overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+
+ cur->bc_rec.r.rm_startblock = bno + len;
+ cur->bc_rec.r.rm_blockcount = orig_len - len -
+ ltrec.rm_blockcount;
+ cur->bc_rec.r.rm_owner = ltrec.rm_owner;
+ if (ignore_off)
+ cur->bc_rec.r.rm_offset = 0;
+ else
+ cur->bc_rec.r.rm_offset = offset + len;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
+ cur->bc_rec.r.rm_startblock,
+ cur->bc_rec.r.rm_blockcount,
+ cur->bc_rec.r.rm_owner,
+ cur->bc_rec.r.rm_offset,
+ cur->bc_rec.r.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ }
+
+out_done:
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_free(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+
+ error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * A mergeable rmap must have the same owner and the same values for
+ * the unwritten, attr_fork, and bmbt flags. The startblock and
+ * offset are checked separately.
+ */
+static bool
+xfs_rmap_is_mergeable(
+ struct xfs_rmap_irec *irec,
+ uint64_t owner,
+ unsigned int flags)
+{
+ if (irec->rm_owner == XFS_RMAP_OWN_NULL)
+ return false;
+ if (irec->rm_owner != owner)
+ return false;
+ if ((flags & XFS_RMAP_UNWRITTEN) ^
+ (irec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return false;
+ if ((flags & XFS_RMAP_ATTR_FORK) ^
+ (irec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return false;
+ if ((flags & XFS_RMAP_BMBT_BLOCK) ^
+ (irec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return false;
+ return true;
+}
+
+/*
+ * When we allocate a new block, the first thing we do is add a reference to
+ * the extent in the rmap btree. This takes the form of a [agbno, length,
+ * owner, offset] record. Flags are encoded in the high bits of the offset
+ * field.
+ */
+STATIC int
+xfs_rmap_map(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(owner != 0);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+ ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
+ &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt) {
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+ }
+
+ if (XFS_IS_CORRUPT(mp,
+ have_lt != 0 &&
+ ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ /*
+ * Note: cursor currently points one record to the right of ltrec, even
+ * if there is no record in the tree to the right.
+ */
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) {
+ /*
+ * left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset) &&
+ (unsigned long)ltrec.rm_blockcount + len +
+ gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) {
+ /*
+ * right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ gtrec.rm_startblock,
+ gtrec.rm_blockcount,
+ gtrec.rm_owner,
+ gtrec.rm_offset,
+ gtrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ /* point the cursor back to the left record and update */
+ error = xfs_btree_decrement(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset)) {
+ /*
+ * right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ if (!ignore_off)
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_update(cur, &gtrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * no contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ owner, offset, flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_alloc(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_rmap_map(cur, bno, len, false, oinfo);
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+#define RMAP_LEFT_CONTIG (1 << 0)
+#define RMAP_RIGHT_CONTIG (1 << 1)
+#define RMAP_LEFT_FILLING (1 << 2)
+#define RMAP_RIGHT_FILLING (1 << 3)
+#define RMAP_LEFT_VALID (1 << 6)
+#define RMAP_RIGHT_VALID (1 << 7)
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+#define NEW r[3]
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa.
+ * Does not handle overlapping extents.
+ */
+STATIC int
+xfs_rmap_convert(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /*
+ * Decrement the cursor to see if we have a left-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ error = xfs_rmap_get_rec(cur, &LEFT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
+ LEFT.rm_blockcount, LEFT.rm_owner,
+ LEFT.rm_offset, LEFT.rm_flags);
+ if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
+ LEFT.rm_offset + LEFT.rm_blockcount == offset &&
+ xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (bno + len == RIGHT.rm_startblock &&
+ offset + len == RIGHT.rm_offset &&
+ xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
+ _RET_IP_);
+
+ /* reset the cursor back to PREV */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW = PREV;
+ NEW.rm_blockcount = len + RIGHT.rm_blockcount;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_startblock += len;
+ NEW.rm_offset += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ NEW.rm_blockcount = offset - PREV.rm_offset;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
+ NEW.rm_startblock, NEW.rm_blockcount,
+ NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ /* new middle extent - newext */
+ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
+ cur->bc_rec.r.rm_flags |= newext;
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa. If there is no
+ * possibility of overlapping extents, delegate to the simpler convert
+ * function.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for and exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext,
+ &PREV, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+ &LEFT, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ newext, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
+ _RET_IP_);
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += RIGHT.rm_blockcount;
+ NEW.rm_flags = RIGHT.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new middle extent - newext */
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount = len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+#undef NEW
+#undef LEFT
+#undef RIGHT
+#undef PREV
+
+/*
+ * Find an extent in the rmap btree and unmap it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _free function.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ &ltrec, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltoff = ltrec.rm_offset;
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Make sure the unwritten flag matches. */
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ /* Check the offset. */
+ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* Exact match, simply remove the record from rmap tree. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * Overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+
+ /* Delete prev rmap. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset. */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ ltrec.rm_offset += len;
+ error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * Overlap right hand side of extent: trim the length and
+ * update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * Overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ /* Shrink the left side of the rmap */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset */
+ error = xfs_rmap_insert(cur, bno + len,
+ orig_len - len - ltrec.rm_blockcount,
+ ltrec.rm_owner, offset + len,
+ ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find an extent in the rmap btree and map it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _alloc function.
+ */
+STATIC int
+xfs_rmap_map_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+ &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt &&
+ !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ flags, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+ /*
+ * Left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ /* Point the cursor back to the left record and update. */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ /* Delete the old record. */
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Move the start and re-add it. */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * No contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ return error;
+}
+
+/* Insert a raw rmap into the rmapbt. */
+int
+xfs_rmap_map_raw(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rmap)
+{
+ struct xfs_owner_info oinfo;
+
+ oinfo.oi_owner = rmap->rm_owner;
+ oinfo.oi_offset = rmap->rm_offset;
+ oinfo.oi_flags = 0;
+ if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return xfs_rmap_map(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+
+ return xfs_rmap_map_shared(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+}
+
+struct xfs_rmap_query_range_info {
+ xfs_rmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_rmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info *query = priv;
+ struct xfs_rmap_irec irec;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (error)
+ return error;
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all rmaps between two keys. */
+int
+xfs_rmap_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_rmap_query_range_info query;
+
+ low_brec.r = *low_rec;
+ high_brec.r = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_rmap_query_range_helper, &query);
+}
+
+/* Find all rmaps. */
+int
+xfs_rmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info query;
+
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
+}
+
+/* Clean up after calling xfs_rmap_finish_one. */
+void
+xfs_rmap_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred rmap operations. We pass back the
+ * btree cursor to maintain our lock on the rmapbt between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_rmap_finish_one(
+ struct xfs_trans *tp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ struct xfs_owner_info oinfo;
+ xfs_agblock_t bno;
+ bool unwritten;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork,
+ startoff, blockcount, state);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ /*
+ * Refresh the freelist before we start changing the
+ * rmapbt, because a shape change could cause us to
+ * allocate blocks.
+ */
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error)
+ goto out_drop;
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ error = -EFSCORRUPTED;
+ goto out_drop;
+ }
+
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ }
+ *pcur = rcur;
+
+ xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
+ unwritten = state == XFS_EXT_UNWRITTEN;
+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);
+
+ switch (type) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_CONVERT:
+ error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+ !unwritten, &oinfo);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+out_drop:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Don't defer an rmap if we aren't an rmap filesystem.
+ */
+static bool
+xfs_rmap_update_is_needed(
+ struct xfs_mount *mp,
+ int whichfork)
+{
+ return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK;
+}
+
+/*
+ * Record a rmap intent; the list is kept sorted first by AG and then by
+ * increasing age.
+ */
+static void
+__xfs_rmap_add(
+ struct xfs_trans *tp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ struct xfs_rmap_intent *ri;
+
+ trace_xfs_rmap_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ owner, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_owner = owner;
+ ri->ri_whichfork = whichfork;
+ ri->ri_bmap = *bmap;
+
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+}
+
+/* Map an extent into a file. */
+void
+xfs_rmap_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/* Unmap an extent out of a file. */
+void
+xfs_rmap_unmap_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/*
+ * Convert a data fork extent from unwritten to real or vice versa.
+ *
+ * Note that tp can be NULL here as no transaction is used for COW fork
+ * unwritten conversion.
+ */
+void
+xfs_rmap_convert_extent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
+ return;
+
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+}
+
+/* Schedule the creation of an rmap for non-file data. */
+void
+xfs_rmap_alloc_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
+ return;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+}
+
+/* Schedule the deletion of an rmap for non-file data. */
+void
+xfs_rmap_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
+ return;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+}
+
+/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
+int
+xfs_rmap_compare(
+ const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b)
+{
+ __u64 oa;
+ __u64 ob;
+
+ oa = xfs_rmap_irec_offset_pack(a);
+ ob = xfs_rmap_irec_offset_pack(b);
+
+ if (a->rm_startblock < b->rm_startblock)
+ return -1;
+ else if (a->rm_startblock > b->rm_startblock)
+ return 1;
+ else if (a->rm_owner < b->rm_owner)
+ return -1;
+ else if (a->rm_owner > b->rm_owner)
+ return 1;
+ else if (oa < ob)
+ return -1;
+ else if (oa > ob)
+ return 1;
+ else
+ return 0;
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_rmap_has_record(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.r.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.r.rm_startblock = bno + len - 1;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}
+
+/*
+ * Is there a record for this owner completely covering a given physical
+ * extent? If so, *has_rmap will be set to true. If there is no record
+ * or the record only covers part of the range, we set *has_rmap to false.
+ * This function doesn't perform range lookups or offset checks, so it is
+ * not suitable for checking data fork blocks.
+ */
+int
+xfs_rmap_record_exists(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ int has_record;
+ struct xfs_rmap_irec irec;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK));
+
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
+ &has_record);
+ if (error)
+ return error;
+ if (!has_record) {
+ *has_rmap = false;
+ return 0;
+ }
+
+ *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
+ irec.rm_startblock + irec.rm_blockcount >= bno + len);
+ return 0;
+}
+
+struct xfs_rmap_key_state {
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+};
+
+/* For each rmap given, figure out if it doesn't match the key we want. */
+STATIC int
+xfs_rmap_has_other_keys_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_rmap_key_state *rks = priv;
+
+ if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
+ ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
+ return 0;
+ return -ECANCELED;
+}
+
+/*
+ * Given an extent and some owner info, can we find records overlapping
+ * the extent whose owner info does not match the given owner?
+ */
+int
+xfs_rmap_has_other_keys(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ struct xfs_rmap_irec low = {0};
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_key_state rks;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
+ *has_rmap = false;
+
+ low.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno + len - 1;
+
+ error = xfs_rmap_query_range(cur, &low, &high,
+ xfs_rmap_has_other_keys_helper, &rks);
+ if (error == -ECANCELED) {
+ *has_rmap = true;
+ return 0;
+ }
+
+ return error;
+}
+
+const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+ .oi_owner = XFS_RMAP_OWN_NULL,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = {
+ .oi_owner = XFS_RMAP_OWN_UNKNOWN,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_FS = {
+ .oi_owner = XFS_RMAP_OWN_FS,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_LOG = {
+ .oi_owner = XFS_RMAP_OWN_LOG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
+ .oi_owner = XFS_RMAP_OWN_AG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = {
+ .oi_owner = XFS_RMAP_OWN_INOBT,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INODES = {
+ .oi_owner = XFS_RMAP_OWN_INODES,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
+ .oi_owner = XFS_RMAP_OWN_REFC,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
+ .oi_owner = XFS_RMAP_OWN_COW,
+};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
new file mode 100644
index 000000000..54741a591
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_RMAP_H__
+#define __XFS_RMAP_H__
+
+struct xfs_perag;
+
+static inline void
+xfs_rmap_ino_bmbt_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = 0;
+ oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline void
+xfs_rmap_ino_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = offset;
+ oi->oi_flags = 0;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline bool
+xfs_rmap_should_skip_owner_update(
+ const struct xfs_owner_info *oi)
+{
+ return oi->oi_owner == XFS_RMAP_OWN_NULL;
+}
+
+/* Reverse mapping functions. */
+
+struct xfs_buf;
+
+static inline __u64
+xfs_rmap_irec_offset_pack(
+ const struct xfs_rmap_irec *irec)
+{
+ __u64 x;
+
+ x = XFS_RMAP_OFF(irec->rm_offset);
+ if (irec->rm_flags & XFS_RMAP_ATTR_FORK)
+ x |= XFS_RMAP_OFF_ATTR_FORK;
+ if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ x |= XFS_RMAP_OFF_BMBT_BLOCK;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ x |= XFS_RMAP_OFF_UNWRITTEN;
+ return x;
+}
+
+static inline int
+xfs_rmap_irec_offset_unpack(
+ __u64 offset,
+ struct xfs_rmap_irec *irec)
+{
+ if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
+ return -EFSCORRUPTED;
+ irec->rm_offset = XFS_RMAP_OFF(offset);
+ irec->rm_flags = 0;
+ if (offset & XFS_RMAP_OFF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (offset & XFS_RMAP_OFF_UNWRITTEN)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+ return 0;
+}
+
+static inline void
+xfs_owner_info_unpack(
+ const struct xfs_owner_info *oinfo,
+ uint64_t *owner,
+ uint64_t *offset,
+ unsigned int *flags)
+{
+ unsigned int r = 0;
+
+ *owner = oinfo->oi_owner;
+ *offset = oinfo->oi_offset;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ r |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ r |= XFS_RMAP_BMBT_BLOCK;
+ *flags = r;
+}
+
+static inline void
+xfs_owner_info_pack(
+ struct xfs_owner_info *oinfo,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ oinfo->oi_owner = owner;
+ oinfo->oi_offset = XFS_RMAP_OFF(offset);
+ oinfo->oi_flags = 0;
+ if (flags & XFS_RMAP_ATTR_FORK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+}
+
+int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo);
+int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo);
+
+int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags);
+int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
+ int *stat);
+
+typedef int (*xfs_rmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv);
+
+int xfs_rmap_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+ void *priv);
+
+enum xfs_rmap_intent_type {
+ XFS_RMAP_MAP,
+ XFS_RMAP_MAP_SHARED,
+ XFS_RMAP_UNMAP,
+ XFS_RMAP_UNMAP_SHARED,
+ XFS_RMAP_CONVERT,
+ XFS_RMAP_CONVERT_SHARED,
+ XFS_RMAP_ALLOC,
+ XFS_RMAP_FREE,
+};
+
+struct xfs_rmap_intent {
+ struct list_head ri_list;
+ enum xfs_rmap_intent_type ri_type;
+ int ri_whichfork;
+ uint64_t ri_owner;
+ struct xfs_bmbt_irec ri_bmap;
+};
+
+/* functions for updating the rmapbt based on bmbt map/unmap operations */
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *imap);
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *imap);
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+
+void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
+ uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+ xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b);
+union xfs_btree_rec;
+int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec);
+int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
+int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
+
+extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_FS;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_AG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
+#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
new file mode 100644
index 000000000..7f83f62e5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -0,0 +1,696 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
+/*
+ * Reverse map btree.
+ *
+ * This is a per-ag tree used to track the owner(s) of a given extent. With
+ * reflink it is possible for there to be multiple owners, which is a departure
+ * from classic XFS. Owner records for data extents are inserted when the
+ * extent is mapped and removed when an extent is unmapped. Owner records for
+ * all other block types (i.e. metadata) are inserted when an extent is
+ * allocated and removed when an extent is freed. There can only be one owner
+ * of a metadata extent, usually an inode or some other metadata structure like
+ * an AG btree.
+ *
+ * The rmap btree is part of the free space management, so blocks for the tree
+ * are sourced from the agfl. Hence we need transaction reservation support for
+ * this tree so that the freelist is always large enough. This also impacts on
+ * the minimum space we need to leave free in the AG.
+ *
+ * The tree is ordered by [ag block, owner, offset]. This is a large key size,
+ * but it is the only way to enforce unique keys when a block can be owned by
+ * multiple files at any offset. There's no need to order/search by extent
+ * size for online updating/management of the tree. It is intended that most
+ * reverse lookups will be to find the owner(s) of a particular block, or to
+ * try to recover tree and file data from corrupt primary metadata.
+ */
+
+static struct xfs_btree_cur *
+xfs_rmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_ag.pag);
+}
+
+STATIC void
+xfs_rmapbt_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ int btnum = cur->bc_btnum;
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_rmapbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ int error;
+ xfs_agblock_t bno;
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
+ &bno, 1);
+ if (error)
+ return error;
+
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
+ if (bno == NULLAGBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+
+ xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
+
+ new->s = cpu_to_be32(bno);
+ be32_add_cpu(&agf->agf_rmap_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
+
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
+
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
+ bno, 1);
+ be32_add_cpu(&agf->agf_rmap_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
+ error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mxr[level != 0];
+}
+
+STATIC void
+xfs_rmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+}
+
+/*
+ * The high key for a reverse mapping record can be computed by shifting
+ * the startblock and offset to the highest value that would still map
+ * to that record. In practice this means that we add blockcount-1 to
+ * the startblock for all records, and if the record is for a data/attr
+ * fork mapping, we add blockcount-1 to the offset too.
+ */
+STATIC void
+xfs_rmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
+
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC int64_t
+xfs_rmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
+
+ d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
+ y = rec->rm_offset;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC int64_t
+xfs_rmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
+
+ d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
+ y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_rmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ xfs_failaddr_t fa;
+ unsigned int level;
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
+ */
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ return __this_address;
+ } else if (level >= mp->m_rmap_maxlevels)
+ return __this_address;
+
+ return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+}
+
+static void
+xfs_rmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rmapbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rmapbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
+ .name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+ .verify_read = xfs_rmapbt_read_verify,
+ .verify_write = xfs_rmapbt_write_verify,
+ .verify_struct = xfs_rmapbt_verify,
+};
+
+STATIC int
+xfs_rmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+static const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+
+ .dup_cursor = xfs_rmapbt_dup_cursor,
+ .set_root = xfs_rmapbt_set_root,
+ .alloc_block = xfs_rmapbt_alloc_block,
+ .free_block = xfs_rmapbt_free_block,
+ .get_minrecs = xfs_rmapbt_get_minrecs,
+ .get_maxrecs = xfs_rmapbt_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+};
+
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ /* Overlapping btree; 2 keys per pointer. */
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
+ cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ops = &xfs_rmapbt_ops;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, tp, pag);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+ XFS_AGF_RMAP_BLOCKS);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+}
+
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rmap btree block.
+ */
+int
+xfs_rmapbt_maxrecs(
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+}
+
+/* Compute the maximum height of an rmap btree. */
+void
+xfs_rmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rmapbt_calc_size(mp, agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_agblock_t agblocks;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = agbp->b_addr;
+ agblocks = be32_to_cpu(agf->agf_length);
+ tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+ xfs_trans_brelse(tp, agbp);
+
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ /* Reserve 1% of the AG or enough for 1 block per record. */
+ *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
+ *used += tree_len;
+
+ return error;
+}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
new file mode 100644
index 000000000..3244715dd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RMAP_BTREE_H__
+#define __XFS_RMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_afakeroot;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_RMAP_REC_ADDR(block, index) \
+ ((struct xfs_rmap_rec *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_rmap_rec))))
+
+#define XFS_RMAP_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_rmap_ptr_t *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * sizeof(xfs_rmap_ptr_t)))
+
+struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+int xfs_rmapbt_maxrecs(int blocklen, int leaf);
+extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000..fa180ab66
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,1098 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Real time buffers need verifiers to avoid runtime warnings during IO.
+ * We don't have anything to verify, however, so these are just dummy
+ * operations.
+ */
+static void
+xfs_rtbuf_verify_read(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+static void
+xfs_rtbuf_verify_write(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+const struct xfs_buf_ops xfs_rtbuf_ops = {
+ .name = "rtbuf",
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+};
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t block, /* block number in bitmap or summary */
+ int issum, /* is summary not bitmap */
+ struct xfs_buf **bpp) /* output: buffer for the block */
+{
+ struct xfs_buf *bp; /* block buffer, result */
+ xfs_inode_t *ip; /* bitmap or summary inode */
+ xfs_bmbt_irec_t map;
+ int nmap = 1;
+ int error; /* error value */
+
+ ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+ return -EFSCORRUPTED;
+
+ ASSERT(map.br_startblock != NULLFSBLOCK);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
+ : XFS_BLFT_RTBITMAP_BUF);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t firstbit; /* first useful bit in the word */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = start - limit + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit < XFS_NBWORD - 1) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+ firstbit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = bit - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i = bit - firstbit + 1;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the previous one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if (len - i) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_NBWORD - (len - i);
+ mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start - i + 1;
+ return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in the word */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = limit - start + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit) {
+ /*
+ * Calculate last (rightmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Calculate mask for all the relevant bits in this word.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start + i - 1;
+ return 0;
+}
+
+/*
+ * Read and/or modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
+ */
+int
+xfs_rtmodify_summary_int(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ struct xfs_buf *bp; /* buffer for the summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (*rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (*rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ /*
+ * Point to the summary information, modify/log it, and/or copy it out.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ if (delta) {
+ uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+ *sp += delta;
+ if (mp->m_rsum_cache) {
+ if (*sp == 0 && log == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno]++;
+ if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ }
+ xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ }
+ if (sum)
+ *sum = *sp;
+ return 0;
+}
+
+int
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+ delta, rbpp, rsb, NULL);
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to modify */
+ xfs_extlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtword_t *first; /* first used word in the buffer */
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask o frelevant bits for value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block, and point to its data.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ first = b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not changed and mask of relevant bits.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ i = lastbit - bit;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Set the word value correctly.
+ */
+ *b = val;
+ i += XFS_NBWORD;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Compute a mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ b++;
+ }
+ /*
+ * Log any remaining changed bytes.
+ */
+ if (b > first)
+ xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp - 1));
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to free */
+ xfs_extlen_t len, /* length to free */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block freed > end */
+ xfs_rtblock_t preblock; /* first block freed < start */
+
+ end = start + len - 1;
+ /*
+ * Modify the bitmap to mark this extent freed.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ if (error) {
+ return error;
+ }
+ /*
+ * Assume we're freeing out of the middle of an allocated extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of allocated extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
+ return error;
+ /*
+ * If there are blocks not being freed at the front of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being freed at the end of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Increment the summary information corresponding to the entire
+ * (new) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtblock_t *new, /* out: first block not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ struct xfs_buf *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zero's; 1 (free) => all one's.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not examined.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ /*
+ * Mask of relevant bits.
+ */
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ val)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * Successful, return.
+ */
+ xfs_trans_brelse(tp, bp);
+ *new = start + i;
+ *stat = 1;
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int /* error */
+xfs_rtcheck_alloc_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ if (error)
+ return error;
+ ASSERT(stat);
+ return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#endif
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len) /* length of extent freed */
+{
+ int error; /* error value */
+ xfs_mount_t *mp; /* file system mount structure */
+ xfs_fsblock_t sb; /* summary file block number */
+ struct xfs_buf *sumbp = NULL; /* summary file block buffer */
+
+ mp = tp->t_mountp;
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ if (error)
+ return error;
+
+ /*
+ * Free the range of realtime blocks.
+ */
+ error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+ if (error) {
+ return error;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+ /*
+ * If we've now freed all the blocks, reset the file sequence
+ * number to 0.
+ */
+ if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ mp->m_sb.sb_rextents) {
+ if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ }
+ return 0;
+}
+
+/* Find all the free records within a given range. */
+int
+xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec rec;
+ xfs_rtblock_t rtstart;
+ xfs_rtblock_t rtend;
+ xfs_rtblock_t high_key;
+ int is_free;
+ int error = 0;
+
+ if (low_rec->ar_startext > high_rec->ar_startext)
+ return -EINVAL;
+ if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
+ low_rec->ar_startext == high_rec->ar_startext)
+ return 0;
+
+ high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
+
+ /* Iterate the bitmap, looking for discrepancies. */
+ rtstart = low_rec->ar_startext;
+ while (rtstart <= high_key) {
+ /* Is the first block free? */
+ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ &is_free);
+ if (error)
+ break;
+
+ /* How long does the extent go for? */
+ error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ if (error)
+ break;
+
+ if (is_free) {
+ rec.ar_startext = rtstart;
+ rec.ar_extcount = rtend - rtstart + 1;
+
+ error = fn(mp, tp, &rec, priv);
+ if (error)
+ break;
+ }
+
+ rtstart = rtend + 1;
+ }
+
+ return error;
+}
+
+/* Find all the free records. */
+int
+xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec keys[2];
+
+ keys[0].ar_startext = 0;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
+ keys[0].ar_extcount = keys[1].ar_extcount = 0;
+
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
+}
+
+/* Is the given extent all free? */
+int
+xfs_rtalloc_extent_is_free(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_rtblock_t start,
+ xfs_extlen_t len,
+ bool *is_free)
+{
+ xfs_rtblock_t end;
+ int matches;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ if (error)
+ return error;
+
+ *is_free = matches;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
new file mode 100644
index 000000000..b6a584e04
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -0,0 +1,1317 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_log.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
+ * We current support XFS v5 formats with known features and v4 superblocks with
+ * at least V2 directories.
+ */
+bool
+xfs_sb_good_version(
+ struct xfs_sb *sbp)
+{
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
+ if (xfs_sb_is_v5(sbp))
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* versions prior to v4 are not supported */
+ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4)
+ return false;
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ /* V4 filesystems need v2 directories and unwritten extents */
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
+
+ /* It's a supported v4 filesystem */
+ return true;
+}
+
+uint64_t
+xfs_sb_version_to_features(
+ struct xfs_sb *sbp)
+{
+ uint64_t features = 0;
+
+ /* optional V4 features */
+ if (sbp->sb_rblocks > 0)
+ features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
+ features |= XFS_FEAT_ATTR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
+ features |= XFS_FEAT_QUOTA;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)
+ features |= XFS_FEAT_ALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)
+ features |= XFS_FEAT_LOGV2;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT)
+ features |= XFS_FEAT_DALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)
+ features |= XFS_FEAT_EXTFLG;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT)
+ features |= XFS_FEAT_SECTOR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT)
+ features |= XFS_FEAT_ASCIICI;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
+ if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
+ features |= XFS_FEAT_LAZYSBCOUNT;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ features |= XFS_FEAT_ATTR2;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
+ features |= XFS_FEAT_PROJID32;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ }
+
+ if (!xfs_sb_is_v5(sbp))
+ return features;
+
+ /* Always on V5 features */
+ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
+
+ /* Optional V5 features */
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ features |= XFS_FEAT_FINOBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ features |= XFS_FEAT_RMAPBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ features |= XFS_FEAT_REFLINK;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+ features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
+ features |= XFS_FEAT_SPINODES;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ features |= XFS_FEAT_META_UUID;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME)
+ features |= XFS_FEAT_BIGTIME;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
+ return features;
+}
+
+/* Check all the superblock fields we care about when reading one in. */
+STATIC int
+xfs_validate_sb_read(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ if (!xfs_sb_is_v5(sbp))
+ return 0;
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations
+ * the kernel cannot support up front before checking anything else.
+ */
+ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.",
+ (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Using a more recent kernel is recommended.");
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ if (!xfs_is_readonly(mp)) {
+ xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.");
+ xfs_warn(mp,
+"Filesystem can only be safely mounted read only.");
+
+ return -EINVAL;
+ }
+ }
+ if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Filesystem cannot be safely mounted by this kernel.");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Check all the superblock fields we care about when writing one out. */
+STATIC int
+xfs_validate_sb_write(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_sb *sbp)
+{
+ /*
+ * Carry out additional sb summary counter sanity checks when we write
+ * the superblock. We skip this in the read validator because there
+ * could be newer superblocks in the log and if the values are garbage
+ * even after replay we'll recalculate them at the end of log mount.
+ *
+ * mkfs has traditionally written zeroed counters to inprogress and
+ * secondary superblocks, so allow this usage to continue because
+ * we never read counters from such superblocks.
+ */
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
+ (sbp->sb_fdblocks > sbp->sb_dblocks ||
+ !xfs_verify_icount(mp, sbp->sb_icount) ||
+ sbp->sb_ifree > sbp->sb_icount)) {
+ xfs_warn(mp, "SB summary counter sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_sb_is_v5(sbp))
+ return 0;
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations
+ * the kernel cannot support since we checked for unsupported bits in
+ * the read verifier, which means that memory is corrupt.
+ */
+ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock compatible features (0x%x)!",
+ (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Corruption detected in superblock read-only compatible features (0x%x)!",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+ if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock incompatible features (0x%x)!",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+ if (xfs_sb_has_incompat_log_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(mp,
+"Corruption detected in superblock incompatible log features (0x%x)!",
+ (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * We can't read verify the sb LSN because the read verifier is called
+ * before the log is allocated and processed. We know the log is set up
+ * before write verifier calls, so check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Check the validity of the SB. */
+STATIC int
+xfs_validate_sb_common(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_sb *sbp)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+ uint32_t agcount = 0;
+ uint32_t rem;
+ bool has_dalign;
+
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
+ return -EWRONGFS;
+ }
+
+ if (!xfs_sb_good_version(sbp)) {
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
+ return -EWRONGFS;
+ }
+
+ /*
+ * Validate feature flags and state
+ */
+ if (xfs_sb_is_v5(sbp)) {
+ if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+ xfs_notice(mp,
+"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)",
+ sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE);
+ return -EFSCORRUPTED;
+ }
+
+ /* V5 has a separate project quota inode */
+ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+ xfs_notice(mp,
+ "Version 5 of Super block has XFS_OQUOTA bits.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) {
+ uint32_t align;
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
+ } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+ xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits.");
+ return -EFSCORRUPTED;
+ }
+
+ if (unlikely(
+ sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an external log; "
+ "specify logdev on the mount command line.");
+ return -EINVAL;
+ }
+
+ if (unlikely(
+ sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an internal log; "
+ "do not specify logdev on the mount command line.");
+ return -EINVAL;
+ }
+
+ /* Compute agcount for this number of dblocks and agblocks */
+ if (sbp->sb_agblocks) {
+ agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem);
+ if (rem)
+ agcount++;
+ }
+
+ /*
+ * More sanity checking. Most of these were stolen directly from
+ * xfs_repair.
+ */
+ if (unlikely(
+ sbp->sb_agcount <= 0 ||
+ sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
+ sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
+ sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
+ sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+ sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
+ sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
+ sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
+ sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
+ sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+ sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
+ sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE ||
+ sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES ||
+ sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 ||
+ agcount == 0 || agcount != sbp->sb_agcount ||
+ (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
+ sbp->sb_dblocks == 0 ||
+ sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
+ sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
+ sbp->sb_shared_vn != 0)) {
+ xfs_notice(mp, "SB sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
+ /*
+ * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign)
+ * would imply the image is corrupted.
+ */
+ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT;
+ if (!!sbp->sb_unit ^ has_dalign) {
+ xfs_notice(mp, "SB stripe alignment sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ return -EFSCORRUPTED;
+
+ /*
+ * Currently only very few inode sizes are supported.
+ */
+ switch (sbp->sb_inodesize) {
+ case 256:
+ case 512:
+ case 1024:
+ case 2048:
+ break;
+ default:
+ xfs_warn(mp, "inode size of %d bytes not supported",
+ sbp->sb_inodesize);
+ return -ENOSYS;
+ }
+
+ return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ /*
+ * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+ * leads to in-core values having two different values for a quota
+ * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+ * NULLFSINO.
+ *
+ * Note that this change affect only the in-core values. These
+ * values are not written back to disk unless any quota information
+ * is written to the disk. Even in that case, sb_pquotino field is
+ * not written to disk unless the superblock supports pquotino.
+ */
+ if (sbp->sb_uquotino == 0)
+ sbp->sb_uquotino = NULLFSINO;
+ if (sbp->sb_gquotino == 0)
+ sbp->sb_gquotino = NULLFSINO;
+ if (sbp->sb_pquotino == 0)
+ sbp->sb_pquotino = NULLFSINO;
+
+ /*
+ * We need to do these manipilations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_is_v5(sbp))
+ return;
+
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT &&
+ sbp->sb_gquotino != NULLFSINO) {
+ /*
+ * In older version of superblock, on-disk superblock only
+ * has sb_gquotino, and in-core superblock has both sb_gquotino
+ * and sb_pquotino. But, only one of them is supported at any
+ * point of time. So, if PQUOTA is set in disk superblock,
+ * copy over sb_gquotino to sb_pquotino. The NULLFSINO test
+ * above is to make sure we don't do this twice and wipe them
+ * both out!
+ */
+ sbp->sb_pquotino = sbp->sb_gquotino;
+ sbp->sb_gquotino = NULLFSINO;
+ }
+}
+
+static void
+__xfs_sb_from_disk(
+ struct xfs_sb *to,
+ struct xfs_dsb *from,
+ bool convert_xquota)
+{
+ to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+ to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+ to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+ to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+ to->sb_rextents = be64_to_cpu(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = be64_to_cpu(from->sb_logstart);
+ to->sb_rootino = be64_to_cpu(from->sb_rootino);
+ to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+ to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+ to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+ to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+ to->sb_agcount = be32_to_cpu(from->sb_agcount);
+ to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+ to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+ to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+ to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+ to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+ to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = be64_to_cpu(from->sb_icount);
+ to->sb_ifree = be64_to_cpu(from->sb_ifree);
+ to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+ to->sb_frextents = be64_to_cpu(from->sb_frextents);
+ to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+ to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+ to->sb_qflags = be16_to_cpu(from->sb_qflags);
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+ to->sb_unit = be32_to_cpu(from->sb_unit);
+ to->sb_width = be32_to_cpu(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+ to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+ to->sb_features2 = be32_to_cpu(from->sb_features2);
+ to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+ to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+ to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+ to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ be32_to_cpu(from->sb_features_log_incompat);
+ /* crc is only used on disk, not in memory; just init to 0 here. */
+ to->sb_crc = 0;
+ to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
+ to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+ to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /*
+ * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+ * feature flag is set; if not set we keep it only in memory.
+ */
+ if (xfs_sb_is_v5(to) &&
+ (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+ else
+ uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
+ /* Convert on-disk flags to in-memory flags? */
+ if (convert_xquota)
+ xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ struct xfs_dsb *from)
+{
+ __xfs_sb_from_disk(to, from, true);
+}
+
+static void
+xfs_sb_quota_to_disk(
+ struct xfs_dsb *to,
+ struct xfs_sb *from)
+{
+ uint16_t qflags = from->sb_qflags;
+
+ to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+
+ /*
+ * The in-memory superblock quota state matches the v5 on-disk format so
+ * just write them out and return
+ */
+ if (xfs_sb_is_v5(from)) {
+ to->sb_qflags = cpu_to_be16(from->sb_qflags);
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+ return;
+ }
+
+ /*
+ * For older superblocks (v4), the in-core version of sb_qflags do not
+ * have XFS_OQUOTA_* flags, whereas the on-disk version does. So,
+ * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+
+ /*
+ * GQUOTINO and PQUOTINO cannot be used together in versions
+ * of superblock that do not have pquotino. from->sb_flags
+ * tells us which quota is active and should be copied to
+ * disk. If neither are active, we should NULL the inode.
+ *
+ * In all cases, the separate pquotino must remain 0 because it
+ * is beyond the "end" of the valid non-pquotino superblock.
+ */
+ if (from->sb_qflags & XFS_GQUOTA_ACCT)
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ else if (from->sb_qflags & XFS_PQUOTA_ACCT)
+ to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+ else {
+ /*
+ * We can't rely on just the fields being logged to tell us
+ * that it is safe to write NULLFSINO - we should only do that
+ * if quotas are not actually enabled. Hence only write
+ * NULLFSINO if both in-core quota inodes are NULL.
+ */
+ if (from->sb_gquotino == NULLFSINO &&
+ from->sb_pquotino == NULLFSINO)
+ to->sb_gquotino = cpu_to_be64(NULLFSINO);
+ }
+
+ to->sb_pquotino = 0;
+}
+
+void
+xfs_sb_to_disk(
+ struct xfs_dsb *to,
+ struct xfs_sb *from)
+{
+ xfs_sb_quota_to_disk(to, from);
+
+ to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
+ to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+ to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
+ to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
+ to->sb_rextents = cpu_to_be64(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = cpu_to_be64(from->sb_logstart);
+ to->sb_rootino = cpu_to_be64(from->sb_rootino);
+ to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
+ to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+ to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
+ to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
+ to->sb_agcount = cpu_to_be32(from->sb_agcount);
+ to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
+ to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
+ to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
+ to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
+ to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
+ to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = cpu_to_be64(from->sb_icount);
+ to->sb_ifree = cpu_to_be64(from->sb_ifree);
+ to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+ to->sb_frextents = cpu_to_be64(from->sb_frextents);
+
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+ to->sb_unit = cpu_to_be32(from->sb_unit);
+ to->sb_width = cpu_to_be32(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+ to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+
+ /*
+ * We need to ensure that bad_features2 always matches features2.
+ * Hence we enforce that here rather than having to remember to do it
+ * everywhere else that updates features2.
+ */
+ from->sb_bad_features2 = from->sb_features2;
+ to->sb_features2 = cpu_to_be32(from->sb_features2);
+ to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+
+ if (!xfs_sb_is_v5(from))
+ return;
+
+ to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+ to->sb_features_ro_compat =
+ cpu_to_be32(from->sb_features_ro_compat);
+ to->sb_features_incompat =
+ cpu_to_be32(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ cpu_to_be32(from->sb_features_log_incompat);
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
+ to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid. We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks. So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ */
+static void
+xfs_sb_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dsb *dsb = bp->b_addr;
+ int error;
+
+ /*
+ * open code the version check to avoid needing to convert the entire
+ * superblock from disk order just to check the version number
+ */
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+ (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+ XFS_SB_VERSION_5) ||
+ dsb->sb_crc != 0)) {
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+ /* Only fail bad secondaries on a known V5 filesystem */
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR ||
+ xfs_has_crc(mp)) {
+ error = -EFSBADCRC;
+ goto out_error;
+ }
+ }
+ }
+
+ /*
+ * Check all the superblock fields. Don't byteswap the xquota flags
+ * because _verify_common checks the on-disk values.
+ */
+ __xfs_sb_from_disk(&sb, dsb, false);
+ error = xfs_validate_sb_common(mp, bp, &sb);
+ if (error)
+ goto out_error;
+ error = xfs_validate_sb_read(mp, &sb);
+
+out_error:
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC)
+ xfs_verifier_error(bp, error, __this_address);
+ else if (error)
+ xfs_buf_ioerror(bp, error);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+ /* XFS filesystem, verify noisily! */
+ xfs_sb_read_verify(bp);
+ return;
+ }
+ /* quietly fail */
+ xfs_buf_ioerror(bp, -EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dsb *dsb = bp->b_addr;
+ int error;
+
+ /*
+ * Check all the superblock fields. Don't byteswap the xquota flags
+ * because _verify_common checks the on-disk values.
+ */
+ __xfs_sb_from_disk(&sb, dsb, false);
+ error = xfs_validate_sb_common(mp, bp, &sb);
+ if (error)
+ goto out_error;
+ error = xfs_validate_sb_write(mp, bp, &sb);
+ if (error)
+ goto out_error;
+
+ if (!xfs_sb_is_v5(&sb))
+ return;
+
+ if (bip)
+ dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+ return;
+
+out_error:
+ xfs_verifier_error(bp, error, __this_address);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
+ .verify_read = xfs_sb_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
+ .verify_read = xfs_sb_quiet_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure.
+ *
+ * Inode geometry are calculated in xfs_ialloc_setup_geometry.
+ */
+void
+xfs_sb_mount_common(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ mp->m_agfrotor = mp->m_agirotor = 0;
+ mp->m_maxagi = mp->m_sb.sb_agcount;
+ mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+ mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+ mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+ mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+ mp->m_blockmask = sbp->sb_blocksize - 1;
+ mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+ mp->m_blockwmask = mp->m_blockwsize - 1;
+
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+ mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+ mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
+ mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
+ mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
+ mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+ mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
+ mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
+}
+
+/*
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
+ * into the superblock buffer to be logged. It does not provide the higher
+ * level of locking that is needed to protect the in-core superblock from
+ * concurrent access.
+ */
+void
+xfs_log_sb(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
+
+ /*
+ * Lazy sb counters don't update the in-core superblock so do that now.
+ * If this is at unmount, the counters will be exactly correct, but at
+ * any other time they will only be ballpark correct because of
+ * reservations that have been taken out percpu counters. If we have an
+ * unclean shutdown, this will be corrected by log recovery rebuilding
+ * the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't need have to update it here.
+ */
+ if (xfs_has_lazysbcount(mp)) {
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ }
+
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+}
+
+/*
+ * xfs_sync_sb
+ *
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+ struct xfs_mount *mp,
+ bool wait)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+ XFS_TRANS_NO_WRITECOUNT, &tp);
+ if (error)
+ return error;
+
+ xfs_log_sb(tp);
+ if (wait)
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Update all the secondary superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against other
+ * operations that access the secondary superblocks, but we don't want to keep
+ * them in memory once it is written so we mark it as a one-shot buffer.
+ */
+int
+xfs_update_secondary_sbs(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = 1;
+ int saved_error = 0;
+ int error = 0;
+ LIST_HEAD (buffer_list);
+
+ /* update secondary superblocks. */
+ for_each_perag_from(mp, agno, pag) {
+ struct xfs_buf *bp;
+
+ error = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
+ XFS_FSS_TO_BB(mp, 1), &bp);
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
+ if (error) {
+ xfs_warn(mp,
+ "error allocating secondary superblock for ag %d",
+ pag->pag_agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_buf_delwri_queue(bp, &buffer_list);
+ xfs_buf_relse(bp);
+
+ /* don't hold too many buffers at once */
+ if (agno % 16)
+ continue;
+
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, pag->pag_agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+ }
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ }
+
+ return saved_error ? saved_error : error;
+}
+
+/*
+ * Same behavior as xfs_sync_sb, except that it is always synchronous and it
+ * also writes the superblock buffer to disk sector 0 immediately.
+ */
+int
+xfs_sync_sb_buf(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ bp = xfs_trans_getsb(tp);
+ xfs_log_sb(tp);
+ xfs_trans_bhold(tp, bp);
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+ /*
+ * write out the sb buffer to get the changes to disk
+ */
+ error = xfs_bwrite(bp);
+out:
+ xfs_buf_relse(bp);
+ return error;
+}
+
+void
+xfs_fs_geometry(
+ struct xfs_mount *mp,
+ struct xfs_fsop_geom *geo,
+ int struct_version)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+
+ memset(geo, 0, sizeof(struct xfs_fsop_geom));
+
+ geo->blocksize = sbp->sb_blocksize;
+ geo->rtextsize = sbp->sb_rextsize;
+ geo->agblocks = sbp->sb_agblocks;
+ geo->agcount = sbp->sb_agcount;
+ geo->logblocks = sbp->sb_logblocks;
+ geo->sectsize = sbp->sb_sectsize;
+ geo->inodesize = sbp->sb_inodesize;
+ geo->imaxpct = sbp->sb_imax_pct;
+ geo->datablocks = sbp->sb_dblocks;
+ geo->rtblocks = sbp->sb_rblocks;
+ geo->rtextents = sbp->sb_rextents;
+ geo->logstart = sbp->sb_logstart;
+ BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid));
+ memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid));
+
+ if (struct_version < 2)
+ return;
+
+ geo->sunit = sbp->sb_unit;
+ geo->swidth = sbp->sb_width;
+
+ if (struct_version < 3)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION;
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ XFS_FSOP_GEOM_FLAGS_EXTFLG;
+ if (xfs_has_attr(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
+ if (xfs_has_quota(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA;
+ if (xfs_has_align(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
+ if (xfs_has_dalign(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
+ if (xfs_has_asciici(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
+ if (xfs_has_lazysbcount(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
+ if (xfs_has_attr2(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
+ if (xfs_has_projid32(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
+ if (xfs_has_crc(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB;
+ if (xfs_has_ftype(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE;
+ if (xfs_has_finobt(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT;
+ if (xfs_has_sparseinodes(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES;
+ if (xfs_has_rmapbt(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
+ if (xfs_has_reflink(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
+ if (xfs_has_bigtime(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
+ if (xfs_has_inobtcounts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
+ if (xfs_has_sector(mp)) {
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
+ geo->logsectsize = sbp->sb_logsectsize;
+ } else {
+ geo->logsectsize = BBSIZE;
+ }
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
+ geo->rtsectsize = sbp->sb_blocksize;
+ geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
+
+ if (struct_version < 4)
+ return;
+
+ if (xfs_has_logv2(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
+
+ geo->logsunit = sbp->sb_logsunit;
+
+ if (struct_version < 5)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION_V5;
+}
+
+/* Read a secondary superblock. */
+int
+xfs_sb_read_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_SSB_REF);
+ *bpp = bp;
+ return 0;
+}
+
+/* Get an uninitialised secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
+ * so users won't be confused by values in error messages.
+ */
+bool
+xfs_validate_stripe_geometry(
+ struct xfs_mount *mp,
+ __s64 sunit,
+ __s64 swidth,
+ int sectorsize,
+ bool silent)
+{
+ if (swidth > INT_MAX) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) is too large", swidth);
+ return false;
+ }
+
+ if (sunit > swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
+ return false;
+ }
+
+ if (sectorsize && (int)sunit % sectorsize) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) must be a multiple of the sector size (%d)",
+ sunit, sectorsize);
+ return false;
+ }
+
+ if (sunit && !swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe unit (%lld) and stripe width of 0", sunit);
+ return false;
+ }
+
+ if (!sunit && swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe width (%lld) and stripe unit of 0", swidth);
+ return false;
+ }
+
+ if (sunit && (int)swidth % (int)sunit) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
+ swidth, sunit);
+ return false;
+ }
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
new file mode 100644
index 000000000..a5e14740e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+
+struct xfs_mount;
+struct xfs_sb;
+struct xfs_dsb;
+struct xfs_trans;
+struct xfs_fsop_geom;
+struct xfs_perag;
+
+extern void xfs_log_sb(struct xfs_trans *tp);
+extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp);
+extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
+extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern bool xfs_sb_good_version(struct xfs_sb *sbp);
+extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
+
+extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
+
+#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
+ int struct_version);
+extern int xfs_sb_read_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+extern int xfs_sb_get_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+
+extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+
+#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
new file mode 100644
index 000000000..c4381388c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including userspace tools
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/* log size calculation functions */
+int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int xfs_log_calc_minimum_size(struct xfs_mount *);
+
+struct xfs_trans_res;
+void xfs_log_get_max_trans_res(struct xfs_mount *mp,
+ struct xfs_trans_res *max_resp);
+
+/*
+ * Values for t_flags.
+ */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to it's reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
+/*
+ * LOWMODE is used by the allocator to activate the lowspace algorithm - when
+ * free space is running low the extent allocator may choose to allocate an
+ * extent from an AG without leaving sufficient space for a btree split when
+ * inserting the new extent. In this case the allocator will enable the
+ * lowspace algorithm which is supposed to allow further allocations (such as
+ * btree splits and newroots) to allocate from sequential AGs. In order to
+ * avoid locking AGs out of order the lowspace algorithm will start searching
+ * for free space from AG 0. If the correct transaction reservations have been
+ * made then this algorithm will eventually find all the space it needs.
+ */
+#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT 0x00000001
+#define XFS_TRANS_SB_IFREE 0x00000002
+#define XFS_TRANS_SB_FDBLOCKS 0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
+#define XFS_TRANS_SB_FREXTENTS 0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
+#define XFS_TRANS_SB_DBLOCKS 0x00000040
+#define XFS_TRANS_SB_AGCOUNT 0x00000080
+#define XFS_TRANS_SB_IMAXPCT 0x00000100
+#define XFS_TRANS_SB_REXTSIZE 0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
+#define XFS_TRANS_SB_RBLOCKS 0x00000800
+#define XFS_TRANS_SB_REXTENTS 0x00001000
+#define XFS_TRANS_SB_REXTSLOG 0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values. This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define XFS_AGF_REF 4
+#define XFS_AGI_REF 4
+#define XFS_AGFL_REF 3
+#define XFS_INO_BTREE_REF 3
+#define XFS_ALLOC_BTREE_REF 2
+#define XFS_BMAP_BTREE_REF 2
+#define XFS_RMAP_BTREE_REF 2
+#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
+#define XFS_ATTR_BTREE_REF 1
+#define XFS_DQUOT_REF 1
+#define XFS_REFC_BTREE_REF 1
+#define XFS_SSB_REF 0
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+
+/* Computed inode geometry for the filesystem. */
+struct xfs_ino_geometry {
+ /* Maximum inode count in this filesystem. */
+ uint64_t maxicount;
+
+ /* Actual inode cluster buffer size, in bytes. */
+ unsigned int inode_cluster_size;
+
+ /*
+ * Desired inode cluster buffer size, in bytes. This value is not
+ * rounded up to at least one filesystem block, which is necessary for
+ * the sole purpose of validating sb_spino_align. Runtime code must
+ * only ever use inode_cluster_size.
+ */
+ unsigned int inode_cluster_size_raw;
+
+ /* Inode cluster sizes, adjusted to be at least 1 fsb. */
+ unsigned int inodes_per_cluster;
+ unsigned int blocks_per_cluster;
+
+ /* Inode cluster alignment. */
+ unsigned int cluster_align;
+ unsigned int cluster_align_inodes;
+ unsigned int inoalign_mask; /* mask sb_inoalignmt if used */
+
+ unsigned int inobt_mxr[2]; /* max inobt btree records */
+ unsigned int inobt_mnr[2]; /* min inobt btree records */
+ unsigned int inobt_maxlevels; /* max inobt btree levels. */
+
+ /* Size of inode allocations under normal operation. */
+ unsigned int ialloc_inos;
+ unsigned int ialloc_blks;
+
+ /* Minimum inode blocks for a sparse allocation. */
+ unsigned int ialloc_min_blks;
+
+ /* stripe unit inode alignment */
+ unsigned int ialloc_align;
+
+ unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed default inode attribute fork offset */
+ unsigned int attr_fork_offset;
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
+
+};
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000..bdc777b9e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+ struct xfs_mount *mp,
+ int pathlen)
+{
+ int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+ return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return 0;
+
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
+ dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+ dsl->sl_offset = cpu_to_be32(offset);
+ dsl->sl_bytes = cpu_to_be32(size);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
+ dsl->sl_owner = cpu_to_be64(ino);
+ dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (offset != be32_to_cpu(dsl->sl_offset))
+ return false;
+ if (size != be32_to_cpu(dsl->sl_bytes))
+ return false;
+ if (ino != be64_to_cpu(dsl->sl_owner))
+ return false;
+
+ /* ok */
+ return true;
+}
+
+static xfs_failaddr_t
+xfs_symlink_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
+ return __this_address;
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno))
+ return __this_address;
+ if (be32_to_cpu(dsl->sl_offset) +
+ be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN)
+ return __this_address;
+ if (dsl->sl_owner == 0)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return __this_address;
+
+ return NULL;
+}
+
+static void
+xfs_symlink_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_symlink_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+}
+
+static void
+xfs_symlink_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_has_crc(mp))
+ return;
+
+ fa = xfs_symlink_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip) {
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
+ .verify_read = xfs_symlink_read_verify,
+ .verify_write = xfs_symlink_write_verify,
+ .verify_struct = xfs_symlink_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ char *buf;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+
+ if (!xfs_has_crc(mp)) {
+ bp->b_ops = NULL;
+ memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ return;
+ }
+
+ /*
+ * As this symlink fits in an inode literal area, it must also fit in
+ * the smallest buffer the filesystem supports.
+ */
+ ASSERT(BBTOB(bp->b_length) >=
+ ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+ memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
+}
+
+/*
+ * Verify the in-memory consistency of an inline symlink data fork. This
+ * does not do on-disk format checks.
+ */
+xfs_failaddr_t
+xfs_symlink_shortform_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ char *sfp = (char *)ifp->if_u1.if_data;
+ int size = ifp->if_bytes;
+ char *endp = sfp + size;
+
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ /*
+ * Zero length symlinks should never occur in memory as they are
+ * never allowed to exist on disk.
+ */
+ if (!size)
+ return __this_address;
+
+ /* No negative sizes or overly long symlink targets. */
+ if (size < 0 || size > XFS_SYMLINK_MAXLEN)
+ return __this_address;
+
+ /* No NULLs in the target either. */
+ if (memchr(sfp, 0, size - 1))
+ return __this_address;
+
+ /* We /did/ null-terminate the buffer, right? */
+ if (*endp != 0)
+ return __this_address;
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
new file mode 100644
index 000000000..8b5547073
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+
+#include <linux/iversion.h>
+
+/*
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
+ */
+void
+xfs_trans_ijoin(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint lock_flags)
+{
+ struct xfs_inode_log_item *iip;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (ip->i_itemp == NULL)
+ xfs_inode_item_init(ip, ip->i_mount);
+ iip = ip->i_itemp;
+
+ ASSERT(iip->ili_lock_flags == 0);
+ iip->ili_lock_flags = lock_flags;
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &iip->ili_item);
+}
+
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int flags)
+{
+ struct inode *inode = VFS_I(ip);
+ struct timespec64 tv;
+
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ tv = current_time(inode);
+
+ if (flags & XFS_ICHGTIME_MOD)
+ inode->i_mtime = tv;
+ if (flags & XFS_ICHGTIME_CHG)
+ inode->i_ctime = tv;
+ if (flags & XFS_ICHGTIME_CREATE)
+ ip->i_crtime = tv;
+}
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+ * associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error. This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
+ */
+void
+xfs_trans_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint flags)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct inode *inode = VFS_I(ip);
+ uint iversion_flags = 0;
+
+ ASSERT(iip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+
+ /*
+ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+ * don't matter - we either will need an extra transaction in 24 hours
+ * to log the timestamps, or will clear already cleared fields in the
+ * worst case.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur. While we have the
+ * inode locked exclusively for metadata modification, we can usually
+ * avoid setting XFS_ILOG_CORE if no one has queried the value since
+ * the last time it was incremented. If we have XFS_ILOG_CORE already
+ * set however, then go ahead and bump the i_version counter
+ * unconditionally.
+ */
+ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+ if (IS_I_VERSION(inode) &&
+ inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+ iversion_flags = XFS_ILOG_CORE;
+ }
+
+ /*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_has_bigtime(ip->i_mount) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Record the specific change for fdatasync optimisation. This allows
+ * fdatasync to skip log forces for inodes that are only timestamp
+ * dirty.
+ */
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields |= flags;
+
+ if (!iip->ili_item.li_buf) {
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * We hold the ILOCK here, so this inode is not going to be
+ * flushed while we are here. Further, because there is no
+ * buffer attached to the item, we know that there is no IO in
+ * progress, so nothing will clear the ili_fields while we read
+ * in the buffer. Hence we can safely drop the spin lock and
+ * read the buffer knowing that the state will not change from
+ * here.
+ */
+ spin_unlock(&iip->ili_lock);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+ if (error) {
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ /*
+ * We need an explicit buffer reference for the log item but
+ * don't want the buffer to remain attached to the transaction.
+ * Hold the buffer but release the transaction reference once
+ * we've attached the inode log item to the buffer log item
+ * list.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&iip->ili_lock);
+ iip->ili_item.li_buf = bp;
+ bp->b_flags |= _XBF_INODES;
+ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ /*
+ * Always OR in the bits from the ili_last_fields field. This is to
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
+ */
+ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+ spin_unlock(&iip->ili_lock);
+}
+
+int
+xfs_trans_roll_inode(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_roll(tpp);
+ if (!error)
+ xfs_trans_ijoin(*tpp, ip, 0);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000..5b2f27cbd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,1028 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+
+#define _ALLOC true
+#define _FREE false
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer. Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+ return round_up(sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction. size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+ uint nbufs,
+ uint size)
+{
+ return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating an extent. In classic XFS there were two trees that will be
+ * modified (bnobt + cntbt). With rmap enabled, there are three trees
+ * (rmapbt). The number of blocks reserved is based on the formula:
+ *
+ * num trees * ((2 blocks/level * max depth) - 1)
+ *
+ * Keep in mind that max depth is calculated separately for each type of tree.
+ */
+uint
+xfs_allocfree_block_count(
+ struct xfs_mount *mp,
+ uint num_ops)
+{
+ uint blocks;
+
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
+ if (xfs_has_rmapbt(mp))
+ blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+
+ return blocks;
+}
+
+/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ * - for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ * - the btree data contained by both forks will fit into the inode size,
+ * hence when combined with the inode core above, we have a total of the
+ * actual inode size.
+ * - the BMBT headers need to be accounted separately, as they are
+ * additional to the records and pointers that fit inside the inode
+ * forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+ struct xfs_mount *mp,
+ uint ninodes)
+{
+ return ninodes *
+ (4 * sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_inode_log_format) +
+ mp->m_sb.sb_inodesize +
+ 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * Inode btree record insertion/removal modifies the inode btree and free space
+ * btrees (since the inobt does not use the agfl). This requires the following
+ * reservation:
+ *
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ *
+ * The caller must account for SB and AG header modifications, etc.
+ */
+STATIC uint
+xfs_calc_inobt_res(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * The free inode btree is a conditional feature. The behavior differs slightly
+ * from that of the traditional inode btree in that the finobt tracks records
+ * for inode chunks with at least one free inode. A record can be removed from
+ * the tree during individual inode allocation. Therefore the finobt
+ * reservation is unconditional for both the inode chunk allocation and
+ * individual inode allocation (modify) cases.
+ *
+ * Behavior aside, the reservation for finobt modification is equivalent to the
+ * traditional inobt: cover a full finobt shape change plus block allocation.
+ */
+STATIC uint
+xfs_calc_finobt_res(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_finobt(mp))
+ return 0;
+
+ return xfs_calc_inobt_res(mp);
+}
+
+/*
+ * Calculate the reservation required to allocate or free an inode chunk. This
+ * includes:
+ *
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the inode chunk: m_ino_geo.ialloc_blks * N
+ *
+ * The size N of the inode chunk reservation depends on whether it is for
+ * allocation or free and which type of create transaction is in use. An inode
+ * chunk free always invalidates the buffers and only requires reservation for
+ * headers (N == 0). An inode chunk allocation requires a chunk sized
+ * reservation on v4 and older superblocks to initialize the chunk. No chunk
+ * reservation is required for allocation on v5 supers, which use ordered
+ * buffers to initialize.
+ */
+STATIC uint
+xfs_calc_inode_chunk_res(
+ struct xfs_mount *mp,
+ bool alloc)
+{
+ uint res, size = 0;
+
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+ if (alloc) {
+ /* icreate tx uses ordered buffers */
+ if (xfs_has_v3inodes(mp))
+ return res;
+ size = XFS_FSB_TO_B(mp, 1);
+ }
+
+ res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size);
+ return res;
+}
+
+/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating a realtime extent. We have to be able to log as many rtbitmap
+ * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
+ */
+static unsigned int
+xfs_rtalloc_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int rtbmp_bytes;
+
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_defer_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_defer_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives (t1):
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if we're writing to a realtime file (t2):
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 1 block
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join (t3):
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
+ */
+STATIC uint
+xfs_calc_write_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ unsigned int t1, t2, t3, t4;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+
+ if (xfs_has_realtime(mp)) {
+ t2 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ blksz) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
+ } else {
+ t2 = 0;
+ }
+
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
+}
+
+/*
+ * In truncating a file we free up to two extents at once. We can modify (t1):
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ unsigned int t1, t2, t3, t4;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
+
+ t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
+
+ if (xfs_has_realtime(mp)) {
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
+ } else {
+ t3 = 0;
+ }
+
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
+}
+
+/*
+ * In renaming a files we can modify:
+ * the five inodes involved: 5 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max((xfs_calc_inode_res(mp, 5) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from unlinked list at first, we can modify:
+ * the agi hash list and counters: sector size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the on disk inode in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ 2 * M_IGEO(mp)->inode_cluster_size;
+}
+
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ max((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to unlinked list we can modify:
+ * the agi hash list: sector size
+ * the on disk inode: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ M_IGEO(mp)->inode_cluster_size;
+}
+
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_add_reservation(mp) +
+ max((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp);
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode chunk (allocation, optional init)
+ * the inobt (record insertion)
+ * the finobt (optional, record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_inode_chunk_res(mp, _ALLOC) +
+ xfs_calc_inobt_res(mp) +
+ xfs_calc_finobt_res(mp);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+ struct xfs_mount *mp)
+{
+ uint res = XFS_DQUOT_LOGRES(mp);
+
+ res += xfs_calc_icreate_resv_alloc(mp);
+ return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_icreate_reservation(mp);
+}
+
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (XFS_SYMLINK_MAXLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_icreate_reservation(mp) +
+ xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter, AGF and AGFL: sector size
+ * the on disk inode (agi unlinked list removal)
+ * the inode chunk (invalidated, headers only)
+ * the inode btree
+ * the finobt (record insertion, removal or modification)
+ *
+ * Note that the inode chunk res. includes an allocfree res. for freeing of the
+ * inode chunk. This is technically extraneous because the inode chunk free is
+ * deferred (it occurs after a transaction roll). Include the extra reservation
+ * anyways since we've had reports of ifree transaction overruns due to too many
+ * agfl fixups during inode chunk frees.
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ xfs_calc_inode_chunk_res(mp, _FREE) +
+ xfs_calc_inobt_res(mp) +
+ xfs_calc_finobt_res(mp);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+ xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
+{
+ return max((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * the superblock for allocations: sector size
+ * the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by:
+ * ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ max((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+ XFS_FSB_TO_B(mp, 1)) +
+ (uint)XFS_FSB_TO_B(mp,
+ XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ * the disk quota buffer: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(void)
+{
+ return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ * the write transaction log space for quota file extent allocation
+ * the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+ struct xfs_mount *mp,
+ bool for_minlogsize)
+{
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
+ xfs_calc_buf_res(1,
+ XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ * the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ int logcount_adj = 0;
+
+ /*
+ * The following transactions are logged in physical format and
+ * require a permanent reservation on space.
+ */
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+ resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+ resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+ resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+ resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+ resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+ resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+ resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+ resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp);
+ resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+ resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create_tmpfile.tr_logres =
+ xfs_calc_create_tmpfile_reservation(mp);
+ resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+ resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+ resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+ resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+ resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+ resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+ resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+ resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+ resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+ resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+ resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+ resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+ resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ /*
+ * The following transactions are logged in logical format with
+ * a default log count.
+ */
+ resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation();
+ resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+ resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ /* growdata requires permanent res; it can free space to the last AG */
+ resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+ resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ /* The following transaction are logged in logical format */
+ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+ resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+ resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+ resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+ resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+ resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+ resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644
index 000000000..0554b9d77
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_RESV_H__
+#define __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+ uint tr_logres; /* log space unit in bytes per log ticket */
+ int tr_logcount; /* number of log operations per log ticket */
+ int tr_logflags; /* log flags, currently only used for indicating
+ * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+ struct xfs_trans_res tr_write; /* extent alloc trans */
+ struct xfs_trans_res tr_itruncate; /* truncate trans */
+ struct xfs_trans_res tr_rename; /* rename trans */
+ struct xfs_trans_res tr_link; /* link trans */
+ struct xfs_trans_res tr_remove; /* unlink trans */
+ struct xfs_trans_res tr_symlink; /* symlink trans */
+ struct xfs_trans_res tr_create; /* create trans */
+ struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
+ struct xfs_trans_res tr_mkdir; /* mkdir trans */
+ struct xfs_trans_res tr_ifree; /* inode free trans */
+ struct xfs_trans_res tr_ichange; /* inode update trans */
+ struct xfs_trans_res tr_growdata; /* fs data section grow trans */
+ struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
+ struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
+ struct xfs_trans_res tr_attrinval; /* attr fork buffer
+ * invalidation */
+ struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
+ * mount time */
+ struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
+ * runtime */
+ struct xfs_trans_res tr_attrrm; /* remove an attribute */
+ struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
+ struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
+ struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
+ struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
+ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
+ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
+ struct xfs_trans_res tr_sb; /* modify superblock */
+ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp) (&(mp)->m_resv)
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp) \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT 1
+#define XFS_DEFAULT_PERM_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_INACTIVE_LOG_COUNT 2
+#define XFS_CREATE_LOG_COUNT 2
+#define XFS_CREATE_TMPFILE_LOG_COUNT 2
+#define XFS_MKDIR_LOG_COUNT 3
+#define XFS_SYMLINK_LOG_COUNT 3
+#define XFS_REMOVE_LOG_COUNT 2
+#define XFS_LINK_LOG_COUNT 2
+#define XFS_RENAME_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT 2
+#define XFS_ADDAFORK_LOG_COUNT 2
+#define XFS_ATTRINVAL_LOG_COUNT 1
+#define XFS_ATTRSET_LOG_COUNT 3
+#define XFS_ATTRRM_LOG_COUNT 3
+
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644
index 000000000..87b31c69a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+
+/* Worst case number of rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
+/* Blocks we might need to add "b" rmaps to a tree. */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ XFS_RMAPADD_SPACE_RES(mp))
+
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
+ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+ XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+ XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
+#define XFS_DAENTER_1B(mp,w) \
+ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+#define XFS_DAENTER_DBS(mp,w) \
+ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w) \
+ (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w) \
+ XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w) \
+ (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w) \
+ (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1
+#define XFS_DIRENTER_SPACE_RES(mp,nl) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+ XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define XFS_IALLOC_SPACE_RES(mp) \
+ (M_IGEO(mp)->ialloc_blks + \
+ ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels))
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp) \
+ ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define XFS_ATTRRM_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp) \
+ (2 * (mp)->m_alloc_maxlevels)
+#define XFS_GROWFSRT_SPACE_RES(mp,b) \
+ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl) \
+ XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+ XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+ XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp) \
+ XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl) \
+ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp) \
+ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
+
+
+#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
new file mode 100644
index 000000000..5c2765934
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+
+
+/*
+ * Verify that an AG block number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agno_agbno(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno)
+{
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+ if (agbno >= eoag)
+ return false;
+ if (agbno <= XFS_AGFL_BLOCK(mp))
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an FS block number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+inline bool
+xfs_verify_fsbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno)
+{
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+}
+
+/*
+ * Verify that a data device extent is fully contained inside the filesystem,
+ * does not cross an AG boundary, and does not point at static metadata.
+ */
+bool
+xfs_verify_fsbext(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ xfs_fsblock_t len)
+{
+ if (fsbno + len <= fsbno)
+ return false;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return false;
+
+ if (!xfs_verify_fsbno(mp, fsbno + len - 1))
+ return false;
+
+ return XFS_FSB_TO_AGNO(mp, fsbno) ==
+ XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agno_agino(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ xfs_agino_t first;
+ xfs_agino_t last;
+
+ xfs_agino_range(mp, agno, &first, &last);
+ return agino >= first && agino <= last;
+}
+
+/*
+ * Verify that an FS inode number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+inline bool
+xfs_verify_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
+ return false;
+ return xfs_verify_agno_agino(mp, agno, agino);
+}
+
+/* Is this an internal inode number? */
+inline bool
+xfs_internal_inum(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+ (xfs_has_quota(mp) &&
+ xfs_is_quota_inode(&mp->m_sb, ino));
+}
+
+/*
+ * Verify that a directory entry's inode number doesn't point at an internal
+ * inode, empty space, or static AG metadata.
+ */
+bool
+xfs_verify_dir_ino(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ if (xfs_internal_inum(mp, ino))
+ return false;
+ return xfs_verify_ino(mp, ino);
+}
+
+/*
+ * Verify that an realtime block number pointer doesn't point off the
+ * end of the realtime device.
+ */
+inline bool
+xfs_verify_rtbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rtbno < mp->m_sb.sb_rblocks;
+}
+
+/* Verify that a realtime device extent is fully contained inside the volume. */
+bool
+xfs_verify_rtext(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_rtblock_t len)
+{
+ if (rtbno + len <= rtbno)
+ return false;
+
+ if (!xfs_verify_rtbno(mp, rtbno))
+ return false;
+
+ return xfs_verify_rtbno(mp, rtbno + len - 1);
+}
+
+/* Calculate the range of valid icount values. */
+inline void
+xfs_icount_range(
+ struct xfs_mount *mp,
+ unsigned long long *min,
+ unsigned long long *max)
+{
+ unsigned long long nr_inos = 0;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ /* root, rtbitmap, rtsum all live in the first chunk */
+ *min = XFS_INODES_PER_CHUNK;
+
+ for_each_perag(mp, agno, pag)
+ nr_inos += pag->agino_max - pag->agino_min + 1;
+ *max = nr_inos;
+}
+
+/* Sanity-checking of inode counts. */
+bool
+xfs_verify_icount(
+ struct xfs_mount *mp,
+ unsigned long long icount)
+{
+ unsigned long long min, max;
+
+ xfs_icount_range(mp, &min, &max);
+ return icount >= min && icount <= max;
+}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
+
+/* Check that a file block offset does not exceed the maximum. */
+bool
+xfs_verify_fileoff(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off)
+{
+ return off <= XFS_MAX_FILEOFF;
+}
+
+/* Check that a range of file block offsets do not exceed the maximum. */
+bool
+xfs_verify_fileext(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off,
+ xfs_fileoff_t len)
+{
+ if (off + len <= off)
+ return false;
+
+ if (!xfs_verify_fileoff(mp, off))
+ return false;
+
+ return xfs_verify_fileoff(mp, off + len - 1);
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
new file mode 100644
index 000000000..5ebdda7e1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TYPES_H__
+#define __XFS_TYPES_H__
+
+typedef uint32_t prid_t; /* project ID */
+
+typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
+typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_agnumber_t; /* allocation group number */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef int64_t xfs_fsize_t; /* bytes in a file */
+typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
+
+typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
+
+typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int64_t xfs_csn_t; /* CIL sequence number */
+
+typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
+typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
+
+typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
+typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
+typedef uint64_t xfs_fileoff_t; /* block number in a file */
+typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+
+typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+
+/*
+ * New verifiers will return the instruction address of the failing check.
+ * NULL means everything is ok.
+ */
+typedef void * xfs_failaddr_t;
+
+/*
+ * Null values for the types.
+ */
+#define NULLFSBLOCK ((xfs_fsblock_t)-1)
+#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
+#define NULLRTBLOCK ((xfs_rtblock_t)-1)
+#define NULLFILEOFF ((xfs_fileoff_t)-1)
+
+#define NULLAGBLOCK ((xfs_agblock_t)-1)
+#define NULLAGNUMBER ((xfs_agnumber_t)-1)
+
+#define NULLCOMMITLSN ((xfs_lsn_t)-1)
+
+#define NULLFSINO ((xfs_ino_t)-1)
+#define NULLAGINO ((xfs_agino_t)-1)
+
+/*
+ * Minimum and maximum blocksize and sectorsize.
+ * The blocksize upper limit is pretty much arbitrary.
+ * The sectorsize upper limit is due to sizeof(sb_sectsize).
+ * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes
+ * cannot be used.
+ */
+#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
+#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
+#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
+#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
+
+/*
+ * Inode fork identifiers.
+ */
+#define XFS_DATA_FORK 0
+#define XFS_ATTR_FORK 1
+#define XFS_COW_FORK 2
+
+#define XFS_WHICHFORK_STRINGS \
+ { XFS_DATA_FORK, "data" }, \
+ { XFS_ATTR_FORK, "attr" }, \
+ { XFS_COW_FORK, "cow" }
+
+/*
+ * Min numbers of data/attr fork btree root pointers.
+ */
+#define MINDBTPTRS 3
+#define MINABTPTRS 2
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN 256
+
+/*
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
+typedef enum {
+ XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+ { XFS_LOOKUP_EQi, "eq" }, \
+ { XFS_LOOKUP_LEi, "le" }, \
+ { XFS_LOOKUP_GEi, "ge" }
+
+/*
+ * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
+ * please keep the TRACE_DEFINE_ENUMs for it up to date.
+ */
+typedef enum {
+ XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+#define XFS_BTNUM_STRINGS \
+ { XFS_BTNUM_BNOi, "bnobt" }, \
+ { XFS_BTNUM_CNTi, "cntbt" }, \
+ { XFS_BTNUM_RMAPi, "rmapbt" }, \
+ { XFS_BTNUM_BMAPi, "bmbt" }, \
+ { XFS_BTNUM_INOi, "inobt" }, \
+ { XFS_BTNUM_FINOi, "finobt" }, \
+ { XFS_BTNUM_REFCi, "refcbt" }
+
+struct xfs_name {
+ const unsigned char *name;
+ int len;
+ int type;
+};
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef uint32_t xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG 3 /* log2(NBBY) */
+#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD (1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+
+struct xfs_iext_cursor {
+ struct xfs_iext_leaf *leaf;
+ int pos;
+};
+
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+} xfs_exntst_t;
+
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+enum xfs_refc_domain {
+ XFS_REFC_DOMAIN_SHARED = 0,
+ XFS_REFC_DOMAIN_COW,
+};
+
+#define XFS_REFC_DOMAIN_STRINGS \
+ { XFS_REFC_DOMAIN_SHARED, "shared" }, \
+ { XFS_REFC_DOMAIN_COW, "cow" }
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+ enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
+};
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_AGFL,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_RMAPBT,
+};
+
+/*
+ * Type verifier functions
+ */
+struct xfs_mount;
+
+bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
+ xfs_fsblock_t len);
+
+bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
+bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_rtblock_t len);
+bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
+void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
+ unsigned long long *max);
+bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
+bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
+ xfs_fileoff_t len);
+
+#endif /* __XFS_TYPES_H__ */